]> CyberLeo.Net >> Repos - FreeBSD/stable/8.git/blob - contrib/bind9/lib/isc/unix/socket.c
MFC r362623:
[FreeBSD/stable/8.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2014  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id$ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #if defined(HAVE_SYS_DEVPOLL_H)
71 #include <sys/devpoll.h>
72 #elif defined(HAVE_DEVPOLL_H)
73 #include <devpoll.h>
74 #endif
75 #endif
76
77 #include "errno2result.h"
78
79 /* See task.c about the following definition: */
80 #ifdef BIND9
81 #ifdef ISC_PLATFORM_USETHREADS
82 #define USE_WATCHER_THREAD
83 #else
84 #define USE_SHARED_MANAGER
85 #endif  /* ISC_PLATFORM_USETHREADS */
86 #endif  /* BIND9 */
87
88 #ifndef USE_WATCHER_THREAD
89 #include "socket_p.h"
90 #include "../task_p.h"
91 #endif /* USE_WATCHER_THREAD */
92
93 #if defined(SO_BSDCOMPAT) && defined(__linux__)
94 #include <sys/utsname.h>
95 #endif
96
97 /*%
98  * Choose the most preferable multiplex method.
99  */
100 #ifdef ISC_PLATFORM_HAVEKQUEUE
101 #define USE_KQUEUE
102 #elif defined (ISC_PLATFORM_HAVEEPOLL)
103 #define USE_EPOLL
104 #elif defined (ISC_PLATFORM_HAVEDEVPOLL)
105 #define USE_DEVPOLL
/*%
 * Per-descriptor interest flags kept for the /dev/poll backend.
 */
typedef struct {
        unsigned int want_read : 1;     /* read interest registered */
        unsigned int want_write : 1;    /* write interest registered */
} pollinfo_t;
110 #else
111 #define USE_SELECT
112 #endif  /* ISC_PLATFORM_HAVEKQUEUE */
113
114 #ifndef USE_WATCHER_THREAD
115 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*%
 * Wait state for builds without a watcher thread when kqueue, epoll or
 * /dev/poll is the multiplexing backend: only an event count is kept.
 */
struct isc_socketwait {
        int     nevents;
};
119 #elif defined (USE_SELECT)
/*%
 * Wait state for builds without a watcher thread when select() is the
 * multiplexing backend: the two descriptor sets plus two integer fields.
 */
struct isc_socketwait {
        fd_set  *readset;
        fd_set  *writeset;
        int     nfds;
        int     maxfd;
};
126 #endif  /* USE_KQUEUE */
127 #endif /* !USE_WATCHER_THREAD */
128
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't guarantee that select() accepts more than (the
 * system default of) FD_SETSIZE descriptors, and the default size should in
 * fact be fine in the vast majority of cases.  This constant should therefore
 * be increased only when absolutely necessary and possible, i.e., the server
 * is exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
148 #ifndef ISC_SOCKET_MAXSOCKETS
149 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
150 #define ISC_SOCKET_MAXSOCKETS 4096
151 #elif defined(USE_SELECT)
152 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
153 #endif  /* USE_KQUEUE... */
154 #endif  /* ISC_SOCKET_MAXSOCKETS */
155
156 #ifdef USE_SELECT
157 /*%
158  * Mac OS X needs a special definition to support larger values in select().
159  * We always define this because a larger value can be specified run-time.
160  */
161 #ifdef __APPLE__
162 #define _DARWIN_UNLIMITED_SELECT
163 #endif  /* __APPLE__ */
164 #endif  /* USE_SELECT */
165
166 #ifdef ISC_SOCKET_USE_POLLWATCH
167 /*%
168  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
169  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
170  * some of the specified FD.  The idea is based on the observation that it's
171  * likely for a busy server to keep receiving packets.  It specifically works
172  * as follows: the socket watcher is first initialized with the state of
173  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
174  * event occurs.  When it wakes up for a socket I/O event, it moves to the
175  * poll_active state, and sets the poll timeout to a short period
176  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
177  * watcher goes to the poll_checking state with the same timeout period.
178  * In this state, the watcher tries to detect whether this is a break
179  * during intermittent events or the kernel bug is triggered.  If the next
180  * polling reports an event within the short period, the previous timeout is
181  * likely to be a kernel bug, and so the watcher goes back to the active state.
182  * Otherwise, it moves to the idle state again.
183  *
184  * It's not clear whether this is a thread-related bug, but since we've only
185  * seen this with threads, this workaround is used only when enabling threads.
186  */
187
typedef enum {
        poll_idle,      /* initial state: sleep until a socket event occurs */
        poll_active,    /* woke up for socket I/O: poll with a short timeout */
        poll_checking   /* timed out while active: one more short poll */
} pollstate_t;
189
190 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
191 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
192 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
193 #endif  /* ISC_SOCKET_USE_POLLWATCH */
194
195 /*%
196  * Size of per-FD lock buckets.
197  */
198 #ifdef ISC_PLATFORM_USETHREADS
199 #define FDLOCK_COUNT            1024
200 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
201 #else
202 #define FDLOCK_COUNT            1
203 #define FDLOCK_ID(fd)           0
204 #endif  /* ISC_PLATFORM_USETHREADS */
205
206 /*%
207  * Maximum number of events communicated with the kernel.  There should normally
208  * be no need for having a large number.
209  */
210 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
211 #ifndef ISC_SOCKET_MAXEVENTS
212 #define ISC_SOCKET_MAXEVENTS    64
213 #endif
214 #endif
215
216 /*%
217  * Some systems define the socket length argument as an int, some as size_t,
218  * some as socklen_t.  This is here so it can be easily changed if needed.
219  */
220 #ifndef ISC_SOCKADDR_LEN_T
221 #define ISC_SOCKADDR_LEN_T unsigned int
222 #endif
223
224 /*%
225  * Define what the possible "soft" errors can be.  These are non-fatal returns
226  * of various network related functions, like recv() and so on.
227  *
228  * For some reason, BSDI (and perhaps others) will sometimes return <0
229  * from recv() but will have errno==0.  This is broken, but we have to
230  * work around it here.
231  */
232 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
233                          (e) == EWOULDBLOCK || \
234                          (e) == EINTR || \
235                          (e) == 0)
236
237 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
238
239 /*!<
240  * DLVL(90)  --  Function entry/exit and other tracing.
241  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
242  * DLVL(60)  --  Socket data send/receive
243  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
244  * DLVL(20)  --  Socket creation/destruction.
245  */
246 #define TRACE_LEVEL             90
247 #define CORRECTNESS_LEVEL       70
248 #define IOEVENT_LEVEL           60
249 #define EVENT_LEVEL             50
250 #define CREATION_LEVEL          20
251
252 #define TRACE           DLVL(TRACE_LEVEL)
253 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
254 #define IOEVENT         DLVL(IOEVENT_LEVEL)
255 #define EVENT           DLVL(EVENT_LEVEL)
256 #define CREATION        DLVL(CREATION_LEVEL)
257
258 typedef isc_event_t intev_t;
259
260 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
261 #define VALID_SOCKET(s)         ISC_MAGIC_VALID(s, SOCKET_MAGIC)
262
263 /*!
264  * IPv6 control information.  If the socket is an IPv6 socket we want
265  * to collect the destination address and interface so the client can
266  * set them on outgoing packets.
267  */
268 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
269 #ifndef USE_CMSG
270 #define USE_CMSG        1
271 #endif
272 #endif
273
274 /*%
275  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
276  * a setsockopt() like interface to request timestamps, and if the OS
277  * doesn't do it for us, call gettimeofday() on every UDP receive?
278  */
279 #ifdef SO_TIMESTAMP
280 #ifndef USE_CMSG
281 #define USE_CMSG        1
282 #endif
283 #endif
284
285 /*%
286  * The size to raise the receive buffer to (from BIND 8).
287  */
288 #define RCVBUFSIZE (32*1024)
289
290 /*%
291  * The number of times a send operation is repeated if the result is EINTR.
292  */
293 #define NRETRIES 10
294
295 typedef struct isc__socket isc__socket_t;
296 typedef struct isc__socketmgr isc__socketmgr_t;
297
298 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
299
300 struct isc__socket {
301         /* Not locked. */
302         isc_socket_t            common;
303         isc__socketmgr_t        *manager;
304         isc_mutex_t             lock;
305         isc_sockettype_t        type;
306         const isc_statscounter_t        *statsindex;
307
308         /* Locked by socket lock. */
309         ISC_LINK(isc__socket_t) link;
310         unsigned int            references;
311         int                     fd;
312         int                     pf;
313         char                            name[16];
314         void *                          tag;
315
316         ISC_LIST(isc_socketevent_t)             send_list;
317         ISC_LIST(isc_socketevent_t)             recv_list;
318         ISC_LIST(isc_socket_newconnev_t)        accept_list;
319         isc_socket_connev_t                    *connect_ev;
320
321         /*
322          * Internal events.  Posted when a descriptor is readable or
323          * writable.  These are statically allocated and never freed.
324          * They will be set to non-purgable before use.
325          */
326         intev_t                 readable_ev;
327         intev_t                 writable_ev;
328
329         isc_sockaddr_t          peer_address;  /* remote address */
330
331         unsigned int            pending_recv : 1,
332                                 pending_send : 1,
333                                 pending_accept : 1,
334                                 listener : 1, /* listener socket */
335                                 connected : 1,
336                                 connecting : 1, /* connect pending */
337                                 bound : 1; /* bound to local addr */
338
339 #ifdef ISC_NET_RECVOVERFLOW
340         unsigned char           overflow; /* used for MSG_TRUNC fake */
341 #endif
342
343         char                    *recvcmsgbuf;
344         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
345         char                    *sendcmsgbuf;
346         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
347
348         void                    *fdwatcharg;
349         isc_sockfdwatch_t       fdwatchcb;
350         int                     fdwatchflags;
351         isc_task_t              *fdwatchtask;
352 };
353
354 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
355 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
356
357 struct isc__socketmgr {
358         /* Not locked. */
359         isc_socketmgr_t         common;
360         isc_mem_t              *mctx;
361         isc_mutex_t             lock;
362         isc_mutex_t             *fdlock;
363         isc_stats_t             *stats;
364 #ifdef USE_KQUEUE
365         int                     kqueue_fd;
366         int                     nevents;
367         struct kevent           *events;
368 #endif  /* USE_KQUEUE */
369 #ifdef USE_EPOLL
370         int                     epoll_fd;
371         int                     nevents;
372         struct epoll_event      *events;
373 #endif  /* USE_EPOLL */
374 #ifdef USE_DEVPOLL
375         int                     devpoll_fd;
376         int                     nevents;
377         struct pollfd           *events;
378 #endif  /* USE_DEVPOLL */
379 #ifdef USE_SELECT
380         int                     fd_bufsize;
381 #endif  /* USE_SELECT */
382         unsigned int            maxsocks;
383 #ifdef ISC_PLATFORM_USETHREADS
384         int                     pipe_fds[2];
385 #endif
386
387         /* Locked by fdlock. */
388         isc__socket_t          **fds;
389         int                     *fdstate;
390 #ifdef USE_DEVPOLL
391         pollinfo_t              *fdpollinfo;
392 #endif
393
394         /* Locked by manager lock. */
395         ISC_LIST(isc__socket_t) socklist;
396 #ifdef USE_SELECT
397         fd_set                  *read_fds;
398         fd_set                  *read_fds_copy;
399         fd_set                  *write_fds;
400         fd_set                  *write_fds_copy;
401         int                     maxfd;
402 #endif  /* USE_SELECT */
403         int                     reserved;       /* unlocked */
404 #ifdef USE_WATCHER_THREAD
405         isc_thread_t            watcher;
406         isc_condition_t         shutdown_ok;
407 #else /* USE_WATCHER_THREAD */
408         unsigned int            refs;
409 #endif /* USE_WATCHER_THREAD */
410         int                     maxudp;
411 };
412
413 #ifdef USE_SHARED_MANAGER
414 static isc__socketmgr_t *socketmgr = NULL;
415 #endif /* USE_SHARED_MANAGER */
416
417 #define CLOSED                  0       /* this one must be zero */
418 #define MANAGED                 1
419 #define CLOSE_PENDING           2
420
421 /*
422  * send() and recv() iovec counts
423  */
424 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
425 #ifdef ISC_NET_RECVOVERFLOW
426 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
427 #else
428 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
429 #endif
430
431 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
432 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
433 static void free_socket(isc__socket_t **);
434 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
435                                     isc__socket_t **);
436 static void destroy(isc__socket_t **);
437 static void internal_accept(isc_task_t *, isc_event_t *);
438 static void internal_connect(isc_task_t *, isc_event_t *);
439 static void internal_recv(isc_task_t *, isc_event_t *);
440 static void internal_send(isc_task_t *, isc_event_t *);
441 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
442 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
443 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
444 static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
445                               struct msghdr *, struct iovec *, size_t *);
446 static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
447                               struct msghdr *, struct iovec *, size_t *);
448 #ifdef USE_WATCHER_THREAD
449 static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
450 #endif
451
452 /*%
453  * The following can be either static or public, depending on build environment.
454  */
455
456 #ifdef BIND9
457 #define ISC_SOCKETFUNC_SCOPE
458 #else
459 #define ISC_SOCKETFUNC_SCOPE static
460 #endif
461
462 ISC_SOCKETFUNC_SCOPE isc_result_t
463 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
464                    isc_socket_t **socketp);
465 ISC_SOCKETFUNC_SCOPE void
466 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
467 ISC_SOCKETFUNC_SCOPE void
468 isc__socket_detach(isc_socket_t **socketp);
469 ISC_SOCKETFUNC_SCOPE isc_result_t
470 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
471 ISC_SOCKETFUNC_SCOPE isc_result_t
472 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
473                        unsigned int maxsocks);
474 ISC_SOCKETFUNC_SCOPE void
475 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
476 ISC_SOCKETFUNC_SCOPE isc_result_t
477 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
478                  unsigned int minimum, isc_task_t *task,
479                   isc_taskaction_t action, const void *arg);
480 ISC_SOCKETFUNC_SCOPE isc_result_t
481 isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
482                  unsigned int minimum, isc_task_t *task,
483                  isc_taskaction_t action, const void *arg);
484 ISC_SOCKETFUNC_SCOPE isc_result_t
485 isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
486                   unsigned int minimum, isc_task_t *task,
487                   isc_socketevent_t *event, unsigned int flags);
488 ISC_SOCKETFUNC_SCOPE isc_result_t
489 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
490                  isc_task_t *task, isc_taskaction_t action, const void *arg);
491 ISC_SOCKETFUNC_SCOPE isc_result_t
492 isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
493                    isc_task_t *task, isc_taskaction_t action, const void *arg,
494                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
495 ISC_SOCKETFUNC_SCOPE isc_result_t
496 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
497                   isc_task_t *task, isc_taskaction_t action, const void *arg);
498 ISC_SOCKETFUNC_SCOPE isc_result_t
499 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
500                     isc_task_t *task, isc_taskaction_t action, const void *arg,
501                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
502 ISC_SOCKETFUNC_SCOPE isc_result_t
503 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
504                      isc_task_t *task, isc_taskaction_t action, const void *arg,
505                      isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
506                      unsigned int flags);
507 ISC_SOCKETFUNC_SCOPE isc_result_t
508 isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
509                     isc_task_t *task,
510                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
511                     isc_socketevent_t *event, unsigned int flags);
512 ISC_SOCKETFUNC_SCOPE void
513 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
514 ISC_SOCKETFUNC_SCOPE isc_result_t
515 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
516                      isc_uint32_t owner, isc_uint32_t group);
517 ISC_SOCKETFUNC_SCOPE isc_result_t
518 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
519                  unsigned int options);
520 ISC_SOCKETFUNC_SCOPE isc_result_t
521 isc__socket_filter(isc_socket_t *sock, const char *filter);
522 ISC_SOCKETFUNC_SCOPE isc_result_t
523 isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
524 ISC_SOCKETFUNC_SCOPE isc_result_t
525 isc__socket_accept(isc_socket_t *sock,
526                    isc_task_t *task, isc_taskaction_t action, const void *arg);
527 ISC_SOCKETFUNC_SCOPE isc_result_t
528 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
529                     isc_task_t *task, isc_taskaction_t action,
530                     const void *arg);
531 ISC_SOCKETFUNC_SCOPE isc_result_t
532 isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
533 ISC_SOCKETFUNC_SCOPE isc_result_t
534 isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
535 ISC_SOCKETFUNC_SCOPE void
536 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
537 ISC_SOCKETFUNC_SCOPE isc_sockettype_t
538 isc__socket_gettype(isc_socket_t *sock);
539 ISC_SOCKETFUNC_SCOPE isc_boolean_t
540 isc__socket_isbound(isc_socket_t *sock);
541 ISC_SOCKETFUNC_SCOPE void
542 isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
543 #if defined(HAVE_LIBXML2) && defined(BIND9)
544 ISC_SOCKETFUNC_SCOPE void
545 isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
546 #endif
547
548 ISC_SOCKETFUNC_SCOPE isc_result_t
549 isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
550                           isc_sockfdwatch_t callback, void *cbarg,
551                           isc_task_t *task, isc_socket_t **socketp);
552 ISC_SOCKETFUNC_SCOPE isc_result_t
553 isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
554
555 static struct {
556         isc_socketmethods_t methods;
557
558         /*%
559          * The following are defined just for avoiding unused static functions.
560          */
561 #ifndef BIND9
562         void *recvv, *send, *sendv, *sendto2, *sendtov, *cleanunix, *permunix,
563              *filter, *listen, *accept, *getpeername, *isbound;
564 #endif
565 } socketmethods = {
566         {
567                 isc__socket_attach,
568                 isc__socket_detach,
569                 isc__socket_bind,
570                 isc__socket_sendto,
571                 isc__socket_connect,
572                 isc__socket_recv,
573                 isc__socket_cancel,
574                 isc__socket_getsockname,
575                 isc__socket_gettype,
576                 isc__socket_ipv6only,
577                 isc__socket_fdwatchpoke
578         }
579 #ifndef BIND9
580         ,
581         (void *)isc__socket_recvv, (void *)isc__socket_send,
582         (void *)isc__socket_sendv, (void *)isc__socket_sendto2,
583         (void *)isc__socket_sendtov,
584         (void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
585         (void *)isc__socket_filter, (void *)isc__socket_listen,
586         (void *)isc__socket_accept, (void *)isc__socket_getpeername,
587         (void *)isc__socket_isbound
588 #endif
589 };
590
591 static isc_socketmgrmethods_t socketmgrmethods = {
592         isc__socketmgr_destroy,
593         isc__socket_create,
594         isc__socket_fdwatchcreate
595 };
596
597 #define SELECT_POKE_SHUTDOWN            (-1)
598 #define SELECT_POKE_NOTHING             (-2)
599 #define SELECT_POKE_READ                (-3)
600 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
601 #define SELECT_POKE_WRITE               (-4)
602 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
603 #define SELECT_POKE_CLOSE               (-5)
604
605 #define SOCK_DEAD(s)                    ((s)->references == 0)
606
607 /*%
608  * Shortcut index arrays to get access to statistics counters.
609  */
enum {
        STATID_OPEN = 0,        /* values below count up from here */
        STATID_OPENFAIL,        /* 1 */
        STATID_CLOSE,           /* 2 */
        STATID_BINDFAIL,        /* 3 */
        STATID_CONNECTFAIL,     /* 4 */
        STATID_CONNECT,         /* 5 */
        STATID_ACCEPTFAIL,      /* 6 */
        STATID_ACCEPT,          /* 7 */
        STATID_SENDFAIL,        /* 8 */
        STATID_RECVFAIL         /* 9 */
};
622 static const isc_statscounter_t udp4statsindex[] = {
623         isc_sockstatscounter_udp4open,
624         isc_sockstatscounter_udp4openfail,
625         isc_sockstatscounter_udp4close,
626         isc_sockstatscounter_udp4bindfail,
627         isc_sockstatscounter_udp4connectfail,
628         isc_sockstatscounter_udp4connect,
629         -1,
630         -1,
631         isc_sockstatscounter_udp4sendfail,
632         isc_sockstatscounter_udp4recvfail
633 };
634 static const isc_statscounter_t udp6statsindex[] = {
635         isc_sockstatscounter_udp6open,
636         isc_sockstatscounter_udp6openfail,
637         isc_sockstatscounter_udp6close,
638         isc_sockstatscounter_udp6bindfail,
639         isc_sockstatscounter_udp6connectfail,
640         isc_sockstatscounter_udp6connect,
641         -1,
642         -1,
643         isc_sockstatscounter_udp6sendfail,
644         isc_sockstatscounter_udp6recvfail
645 };
646 static const isc_statscounter_t tcp4statsindex[] = {
647         isc_sockstatscounter_tcp4open,
648         isc_sockstatscounter_tcp4openfail,
649         isc_sockstatscounter_tcp4close,
650         isc_sockstatscounter_tcp4bindfail,
651         isc_sockstatscounter_tcp4connectfail,
652         isc_sockstatscounter_tcp4connect,
653         isc_sockstatscounter_tcp4acceptfail,
654         isc_sockstatscounter_tcp4accept,
655         isc_sockstatscounter_tcp4sendfail,
656         isc_sockstatscounter_tcp4recvfail
657 };
658 static const isc_statscounter_t tcp6statsindex[] = {
659         isc_sockstatscounter_tcp6open,
660         isc_sockstatscounter_tcp6openfail,
661         isc_sockstatscounter_tcp6close,
662         isc_sockstatscounter_tcp6bindfail,
663         isc_sockstatscounter_tcp6connectfail,
664         isc_sockstatscounter_tcp6connect,
665         isc_sockstatscounter_tcp6acceptfail,
666         isc_sockstatscounter_tcp6accept,
667         isc_sockstatscounter_tcp6sendfail,
668         isc_sockstatscounter_tcp6recvfail
669 };
670 static const isc_statscounter_t unixstatsindex[] = {
671         isc_sockstatscounter_unixopen,
672         isc_sockstatscounter_unixopenfail,
673         isc_sockstatscounter_unixclose,
674         isc_sockstatscounter_unixbindfail,
675         isc_sockstatscounter_unixconnectfail,
676         isc_sockstatscounter_unixconnect,
677         isc_sockstatscounter_unixacceptfail,
678         isc_sockstatscounter_unixaccept,
679         isc_sockstatscounter_unixsendfail,
680         isc_sockstatscounter_unixrecvfail
681 };
682 static const isc_statscounter_t fdwatchstatsindex[] = {
683         -1,
684         -1,
685         isc_sockstatscounter_fdwatchclose,
686         isc_sockstatscounter_fdwatchbindfail,
687         isc_sockstatscounter_fdwatchconnectfail,
688         isc_sockstatscounter_fdwatchconnect,
689         -1,
690         -1,
691         isc_sockstatscounter_fdwatchsendfail,
692         isc_sockstatscounter_fdwatchrecvfail
693 };
694
695 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
696     defined(USE_WATCHER_THREAD)
697 static void
698 manager_log(isc__socketmgr_t *sockmgr,
699             isc_logcategory_t *category, isc_logmodule_t *module, int level,
700             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
701 static void
702 manager_log(isc__socketmgr_t *sockmgr,
703             isc_logcategory_t *category, isc_logmodule_t *module, int level,
704             const char *fmt, ...)
705 {
706         char msgbuf[2048];
707         va_list ap;
708
709         if (! isc_log_wouldlog(isc_lctx, level))
710                 return;
711
712         va_start(ap, fmt);
713         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
714         va_end(ap);
715
716         isc_log_write(isc_lctx, category, module, level,
717                       "sockmgr %p: %s", sockmgr, msgbuf);
718 }
719 #endif
720
721 static void
722 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
723            isc_logcategory_t *category, isc_logmodule_t *module, int level,
724            isc_msgcat_t *msgcat, int msgset, int message,
725            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
726 static void
727 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
728            isc_logcategory_t *category, isc_logmodule_t *module, int level,
729            isc_msgcat_t *msgcat, int msgset, int message,
730            const char *fmt, ...)
731 {
732         char msgbuf[2048];
733         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
734         va_list ap;
735
736         if (! isc_log_wouldlog(isc_lctx, level))
737                 return;
738
739         va_start(ap, fmt);
740         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
741         va_end(ap);
742
743         if (address == NULL) {
744                 isc_log_iwrite(isc_lctx, category, module, level,
745                                msgcat, msgset, message,
746                                "socket %p: %s", sock, msgbuf);
747         } else {
748                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
749                 isc_log_iwrite(isc_lctx, category, module, level,
750                                msgcat, msgset, message,
751                                "socket %p %s: %s", sock, peerbuf, msgbuf);
752         }
753 }
754
755 #if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
756     defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
757 /*
758  * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
759  * setting IPV6_V6ONLY.
760  */
761 static void
762 FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
763 {
764         char strbuf[ISC_STRERRORSIZE];
765         int on = 1;
766
767         if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
768                 return;
769
770         if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
771                        (void *)&on, sizeof(on)) < 0) {
772
773                 isc__strerror(errno, strbuf, sizeof(strbuf));
774                 UNEXPECTED_ERROR(__FILE__, __LINE__,
775                                  "setsockopt(%d, IPV6_RECVPKTINFO) "
776                                  "%s: %s", sock->fd,
777                                  isc_msgcat_get(isc_msgcat,
778                                                 ISC_MSGSET_GENERAL,
779                                                 ISC_MSG_FAILED,
780                                                 "failed"),
781                                  strbuf);
782         }
783 }
784 #else
785 #define FIX_IPV6_RECVPKTINFO(sock) (void)0
786 #endif
787
788 /*%
789  * Increment socket-related statistics counters.
790  */
791 static inline void
792 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
793         REQUIRE(counterid != -1);
794
795         if (stats != NULL)
796                 isc_stats_increment(stats, counterid);
797 }
798
799 static inline isc_result_t
800 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
801         isc_result_t result = ISC_R_SUCCESS;
802
803 #ifdef USE_KQUEUE
804         struct kevent evchange;
805
806         memset(&evchange, 0, sizeof(evchange));
807         if (msg == SELECT_POKE_READ)
808                 evchange.filter = EVFILT_READ;
809         else
810                 evchange.filter = EVFILT_WRITE;
811         evchange.flags = EV_ADD;
812         evchange.ident = fd;
813         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
814                 result = isc__errno2result(errno);
815
816         return (result);
817 #elif defined(USE_EPOLL)
818         struct epoll_event event;
819
820         if (msg == SELECT_POKE_READ)
821                 event.events = EPOLLIN;
822         else
823                 event.events = EPOLLOUT;
824         memset(&event.data, 0, sizeof(event.data));
825         event.data.fd = fd;
826         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
827             errno != EEXIST) {
828                 result = isc__errno2result(errno);
829         }
830
831         return (result);
832 #elif defined(USE_DEVPOLL)
833         struct pollfd pfd;
834         int lockid = FDLOCK_ID(fd);
835
836         memset(&pfd, 0, sizeof(pfd));
837         if (msg == SELECT_POKE_READ)
838                 pfd.events = POLLIN;
839         else
840                 pfd.events = POLLOUT;
841         pfd.fd = fd;
842         pfd.revents = 0;
843         LOCK(&manager->fdlock[lockid]);
844         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
845                 result = isc__errno2result(errno);
846         else {
847                 if (msg == SELECT_POKE_READ)
848                         manager->fdpollinfo[fd].want_read = 1;
849                 else
850                         manager->fdpollinfo[fd].want_write = 1;
851         }
852         UNLOCK(&manager->fdlock[lockid]);
853
854         return (result);
855 #elif defined(USE_SELECT)
856         LOCK(&manager->lock);
857         if (msg == SELECT_POKE_READ)
858                 FD_SET(fd, manager->read_fds);
859         if (msg == SELECT_POKE_WRITE)
860                 FD_SET(fd, manager->write_fds);
861         UNLOCK(&manager->lock);
862
863         return (result);
864 #endif
865 }
866
867 static inline isc_result_t
868 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
869         isc_result_t result = ISC_R_SUCCESS;
870
871 #ifdef USE_KQUEUE
872         struct kevent evchange;
873
874         memset(&evchange, 0, sizeof(evchange));
875         if (msg == SELECT_POKE_READ)
876                 evchange.filter = EVFILT_READ;
877         else
878                 evchange.filter = EVFILT_WRITE;
879         evchange.flags = EV_DELETE;
880         evchange.ident = fd;
881         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
882                 result = isc__errno2result(errno);
883
884         return (result);
885 #elif defined(USE_EPOLL)
886         struct epoll_event event;
887
888         if (msg == SELECT_POKE_READ)
889                 event.events = EPOLLIN;
890         else
891                 event.events = EPOLLOUT;
892         memset(&event.data, 0, sizeof(event.data));
893         event.data.fd = fd;
894         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
895             errno != ENOENT) {
896                 char strbuf[ISC_STRERRORSIZE];
897                 isc__strerror(errno, strbuf, sizeof(strbuf));
898                 UNEXPECTED_ERROR(__FILE__, __LINE__,
899                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
900                 result = ISC_R_UNEXPECTED;
901         }
902         return (result);
903 #elif defined(USE_DEVPOLL)
904         struct pollfd pfds[2];
905         size_t writelen = sizeof(pfds[0]);
906         int lockid = FDLOCK_ID(fd);
907
908         memset(pfds, 0, sizeof(pfds));
909         pfds[0].events = POLLREMOVE;
910         pfds[0].fd = fd;
911
912         /*
913          * Canceling read or write polling via /dev/poll is tricky.  Since it
914          * only provides a way of canceling per FD, we may need to re-poll the
915          * socket for the other operation.
916          */
917         LOCK(&manager->fdlock[lockid]);
918         if (msg == SELECT_POKE_READ &&
919             manager->fdpollinfo[fd].want_write == 1) {
920                 pfds[1].events = POLLOUT;
921                 pfds[1].fd = fd;
922                 writelen += sizeof(pfds[1]);
923         }
924         if (msg == SELECT_POKE_WRITE &&
925             manager->fdpollinfo[fd].want_read == 1) {
926                 pfds[1].events = POLLIN;
927                 pfds[1].fd = fd;
928                 writelen += sizeof(pfds[1]);
929         }
930
931         if (write(manager->devpoll_fd, pfds, writelen) == -1)
932                 result = isc__errno2result(errno);
933         else {
934                 if (msg == SELECT_POKE_READ)
935                         manager->fdpollinfo[fd].want_read = 0;
936                 else
937                         manager->fdpollinfo[fd].want_write = 0;
938         }
939         UNLOCK(&manager->fdlock[lockid]);
940
941         return (result);
942 #elif defined(USE_SELECT)
943         LOCK(&manager->lock);
944         if (msg == SELECT_POKE_READ)
945                 FD_CLR(fd, manager->read_fds);
946         else if (msg == SELECT_POKE_WRITE)
947                 FD_CLR(fd, manager->write_fds);
948         UNLOCK(&manager->lock);
949
950         return (result);
951 #endif
952 }
953
954 static void
955 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
956         isc_result_t result;
957         int lockid = FDLOCK_ID(fd);
958
959         /*
960          * This is a wakeup on a socket.  If the socket is not in the
961          * process of being closed, start watching it for either reads
962          * or writes.
963          */
964
965         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
966
967         if (msg == SELECT_POKE_CLOSE) {
968                 /* No one should be updating fdstate, so no need to lock it */
969                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
970                 manager->fdstate[fd] = CLOSED;
971                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
972                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
973                 (void)close(fd);
974                 return;
975         }
976
977         LOCK(&manager->fdlock[lockid]);
978         if (manager->fdstate[fd] == CLOSE_PENDING) {
979                 UNLOCK(&manager->fdlock[lockid]);
980
981                 /*
982                  * We accept (and ignore) any error from unwatch_fd() as we are
983                  * closing the socket, hoping it doesn't leave dangling state in
984                  * the kernel.
985                  * Note that unwatch_fd() must be called after releasing the
986                  * fdlock; otherwise it could cause deadlock due to a lock order
987                  * reversal.
988                  */
989                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
990                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
991                 return;
992         }
993         if (manager->fdstate[fd] != MANAGED) {
994                 UNLOCK(&manager->fdlock[lockid]);
995                 return;
996         }
997         UNLOCK(&manager->fdlock[lockid]);
998
999         /*
1000          * Set requested bit.
1001          */
1002         result = watch_fd(manager, fd, msg);
1003         if (result != ISC_R_SUCCESS) {
1004                 /*
1005                  * XXXJT: what should we do?  Ignoring the failure of watching
1006                  * a socket will make the application dysfunctional, but there
1007                  * seems to be no reasonable recovery process.
1008                  */
1009                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1010                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1011                               "failed to start watching FD (%d): %s",
1012                               fd, isc_result_totext(result));
1013         }
1014 }
1015
1016 #ifdef USE_WATCHER_THREAD
1017 /*
1018  * Poke the select loop when there is something for us to do.
1019  * The write is required (by POSIX) to complete.  That is, we
1020  * will not get partial writes.
1021  */
1022 static void
1023 select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
1024         int cc;
1025         int buf[2];
1026         char strbuf[ISC_STRERRORSIZE];
1027
1028         buf[0] = fd;
1029         buf[1] = msg;
1030
1031         do {
1032                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1033 #ifdef ENOSR
1034                 /*
1035                  * Treat ENOSR as EAGAIN but loop slowly as it is
1036                  * unlikely to clear fast.
1037                  */
1038                 if (cc < 0 && errno == ENOSR) {
1039                         sleep(1);
1040                         errno = EAGAIN;
1041                 }
1042 #endif
1043         } while (cc < 0 && SOFT_ERROR(errno));
1044
1045         if (cc < 0) {
1046                 isc__strerror(errno, strbuf, sizeof(strbuf));
1047                 FATAL_ERROR(__FILE__, __LINE__,
1048                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1049                                            ISC_MSG_WRITEFAILED,
1050                                            "write() failed "
1051                                            "during watcher poke: %s"),
1052                             strbuf);
1053         }
1054
1055         INSIST(cc == sizeof(buf));
1056 }
1057
1058 /*
1059  * Read a message on the internal fd.
1060  */
1061 static void
1062 select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1063         int buf[2];
1064         int cc;
1065         char strbuf[ISC_STRERRORSIZE];
1066
1067         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
1068         if (cc < 0) {
1069                 *msg = SELECT_POKE_NOTHING;
1070                 *fd = -1;       /* Silence compiler. */
1071                 if (SOFT_ERROR(errno))
1072                         return;
1073
1074                 isc__strerror(errno, strbuf, sizeof(strbuf));
1075                 FATAL_ERROR(__FILE__, __LINE__,
1076                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1077                                            ISC_MSG_READFAILED,
1078                                            "read() failed "
1079                                            "during watcher poke: %s"),
1080                             strbuf);
1081
1082                 return;
1083         }
1084         INSIST(cc == sizeof(buf));
1085
1086         *fd = buf[0];
1087         *msg = buf[1];
1088 }
1089 #else /* USE_WATCHER_THREAD */
1090 /*
1091  * Update the state of the socketmgr when something changes.
1092  */
1093 static void
1094 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
1095         if (msg == SELECT_POKE_SHUTDOWN)
1096                 return;
1097         else if (fd >= 0)
1098                 wakeup_socket(manager, fd, msg);
1099         return;
1100 }
1101 #endif /* USE_WATCHER_THREAD */
1102
1103 /*
1104  * Make a fd non-blocking.
1105  */
1106 static isc_result_t
1107 make_nonblock(int fd) {
1108         int ret;
1109         int flags;
1110         char strbuf[ISC_STRERRORSIZE];
1111 #ifdef USE_FIONBIO_IOCTL
1112         int on = 1;
1113
1114         ret = ioctl(fd, FIONBIO, (char *)&on);
1115 #else
1116         flags = fcntl(fd, F_GETFL, 0);
1117         flags |= PORT_NONBLOCK;
1118         ret = fcntl(fd, F_SETFL, flags);
1119 #endif
1120
1121         if (ret == -1) {
1122                 isc__strerror(errno, strbuf, sizeof(strbuf));
1123                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1124 #ifdef USE_FIONBIO_IOCTL
1125                                  "ioctl(%d, FIONBIO, &on): %s", fd,
1126 #else
1127                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1128 #endif
1129                                  strbuf);
1130
1131                 return (ISC_R_UNEXPECTED);
1132         }
1133
1134         return (ISC_R_SUCCESS);
1135 }
1136
1137 #ifdef USE_CMSG
1138 /*
1139  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1140  * In order to ensure as much portability as possible, we provide wrapper
1141  * functions of these macros.
1142  * Note that cmsg_space() could run slow on OSes that do not have
1143  * CMSG_SPACE.
1144  */
1145 static inline ISC_SOCKADDR_LEN_T
1146 cmsg_len(ISC_SOCKADDR_LEN_T len) {
1147 #ifdef CMSG_LEN
1148         return (CMSG_LEN(len));
1149 #else
1150         ISC_SOCKADDR_LEN_T hdrlen;
1151
1152         /*
1153          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
1154          * is correct.
1155          */
1156         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
1157         return (hdrlen + len);
1158 #endif
1159 }
1160
1161 static inline ISC_SOCKADDR_LEN_T
1162 cmsg_space(ISC_SOCKADDR_LEN_T len) {
1163 #ifdef CMSG_SPACE
1164         return (CMSG_SPACE(len));
1165 #else
1166         struct msghdr msg;
1167         struct cmsghdr *cmsgp;
1168         /*
1169          * XXX: The buffer length is an ad-hoc value, but should be enough
1170          * in a practical sense.
1171          */
1172         char dummybuf[sizeof(struct cmsghdr) + 1024];
1173
1174         memset(&msg, 0, sizeof(msg));
1175         msg.msg_control = dummybuf;
1176         msg.msg_controllen = sizeof(dummybuf);
1177
1178         cmsgp = (struct cmsghdr *)dummybuf;
1179         cmsgp->cmsg_len = cmsg_len(len);
1180
1181         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1182         if (cmsgp != NULL)
1183                 return ((char *)cmsgp - (char *)msg.msg_control);
1184         else
1185                 return (0);
1186 #endif
1187 }
1188 #endif /* USE_CMSG */
1189
1190 /*
1191  * Process control messages received on a socket.
1192  */
1193 static void
1194 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1195 #ifdef USE_CMSG
1196         struct cmsghdr *cmsgp;
1197 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1198         struct in6_pktinfo *pktinfop;
1199 #endif
1200 #ifdef SO_TIMESTAMP
1201         void *timevalp;
1202 #endif
1203 #endif
1204
1205         /*
1206          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1207          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1208          * They are all here, outside of the CPP tests, because it is
1209          * more consistent with the usual ISC coding style.
1210          */
1211         UNUSED(sock);
1212         UNUSED(msg);
1213         UNUSED(dev);
1214
1215 #ifdef ISC_NET_BSD44MSGHDR
1216
1217 #ifdef MSG_TRUNC
1218         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1219                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1220 #endif
1221
1222 #ifdef MSG_CTRUNC
1223         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1224                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1225 #endif
1226
1227 #ifndef USE_CMSG
1228         return;
1229 #else
1230         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1231                 return;
1232
1233 #ifdef SO_TIMESTAMP
1234         timevalp = NULL;
1235 #endif
1236 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1237         pktinfop = NULL;
1238 #endif
1239
1240         cmsgp = CMSG_FIRSTHDR(msg);
1241         while (cmsgp != NULL) {
1242                 socket_log(sock, NULL, TRACE,
1243                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1244                            "processing cmsg %p", cmsgp);
1245
1246 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1247                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1248                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1249
1250                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1251                         memmove(&dev->pktinfo, pktinfop,
1252                                 sizeof(struct in6_pktinfo));
1253                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1254                         socket_log(sock, NULL, TRACE,
1255                                    isc_msgcat, ISC_MSGSET_SOCKET,
1256                                    ISC_MSG_IFRECEIVED,
1257                                    "interface received on ifindex %u",
1258                                    dev->pktinfo.ipi6_ifindex);
1259                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1260                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1261                         goto next;
1262                 }
1263 #endif
1264
1265 #ifdef SO_TIMESTAMP
1266                 if (cmsgp->cmsg_level == SOL_SOCKET
1267                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1268                         struct timeval tv;
1269                         timevalp = CMSG_DATA(cmsgp);
1270                         memmove(&tv, timevalp, sizeof(tv));
1271                         dev->timestamp.seconds = tv.tv_sec;
1272                         dev->timestamp.nanoseconds = tv.tv_usec * 1000;
1273                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1274                         goto next;
1275                 }
1276 #endif
1277
1278         next:
1279                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1280         }
1281 #endif /* USE_CMSG */
1282
1283 #endif /* ISC_NET_BSD44MSGHDR */
1284 }
1285
1286 /*
1287  * Construct an iov array and attach it to the msghdr passed in.  This is
1288  * the SEND constructor, which will use the used region of the buffer
1289  * (if using a buffer list) or will use the internal region (if a single
1290  * buffer I/O is requested).
1291  *
1292  * Nothing can be NULL, and the done event must list at least one buffer
1293  * on the buffer linked list for this function to be meaningful.
1294  *
1295  * If write_countp != NULL, *write_countp will hold the number of bytes
1296  * this transaction can send.
1297  */
1298 static void
1299 build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
1300                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1301 {
1302         unsigned int iovcount;
1303         isc_buffer_t *buffer;
1304         isc_region_t used;
1305         size_t write_count;
1306         size_t skip_count;
1307
1308         memset(msg, 0, sizeof(*msg));
1309
1310         if (!sock->connected) {
1311                 msg->msg_name = (void *)&dev->address.type.sa;
1312                 msg->msg_namelen = dev->address.length;
1313         } else {
1314                 msg->msg_name = NULL;
1315                 msg->msg_namelen = 0;
1316         }
1317
1318         buffer = ISC_LIST_HEAD(dev->bufferlist);
1319         write_count = 0;
1320         iovcount = 0;
1321
1322         /*
1323          * Single buffer I/O?  Skip what we've done so far in this region.
1324          */
1325         if (buffer == NULL) {
1326                 write_count = dev->region.length - dev->n;
1327                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1328                 iov[0].iov_len = write_count;
1329                 iovcount = 1;
1330
1331                 goto config;
1332         }
1333
1334         /*
1335          * Multibuffer I/O.
1336          * Skip the data in the buffer list that we have already written.
1337          */
1338         skip_count = dev->n;
1339         while (buffer != NULL) {
1340                 REQUIRE(ISC_BUFFER_VALID(buffer));
1341                 if (skip_count < isc_buffer_usedlength(buffer))
1342                         break;
1343                 skip_count -= isc_buffer_usedlength(buffer);
1344                 buffer = ISC_LIST_NEXT(buffer, link);
1345         }
1346
1347         while (buffer != NULL) {
1348                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1349
1350                 isc_buffer_usedregion(buffer, &used);
1351
1352                 if (used.length > 0) {
1353                         iov[iovcount].iov_base = (void *)(used.base
1354                                                           + skip_count);
1355                         iov[iovcount].iov_len = used.length - skip_count;
1356                         write_count += (used.length - skip_count);
1357                         skip_count = 0;
1358                         iovcount++;
1359                 }
1360                 buffer = ISC_LIST_NEXT(buffer, link);
1361         }
1362
1363         INSIST(skip_count == 0U);
1364
1365  config:
1366         msg->msg_iov = iov;
1367         msg->msg_iovlen = iovcount;
1368
1369 #ifdef ISC_NET_BSD44MSGHDR
1370         msg->msg_control = NULL;
1371         msg->msg_controllen = 0;
1372         msg->msg_flags = 0;
1373 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1374         if ((sock->type == isc_sockettype_udp)
1375             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1376 #if defined(IPV6_USE_MIN_MTU)
1377                 int use_min_mtu = 1;    /* -1, 0, 1 */
1378 #endif
1379                 struct cmsghdr *cmsgp;
1380                 struct in6_pktinfo *pktinfop;
1381
1382                 socket_log(sock, NULL, TRACE,
1383                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1384                            "sendto pktinfo data, ifindex %u",
1385                            dev->pktinfo.ipi6_ifindex);
1386
1387                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1388                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1389                 msg->msg_control = (void *)sock->sendcmsgbuf;
1390
1391                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1392                 cmsgp->cmsg_level = IPPROTO_IPV6;
1393                 cmsgp->cmsg_type = IPV6_PKTINFO;
1394                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1395                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1396                 memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1397 #if defined(IPV6_USE_MIN_MTU)
1398                 /*
1399                  * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1400                  * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1401                  * is used.
1402                  */
1403                 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1404                                            msg->msg_controllen);
1405                 msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1406                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1407
1408                 cmsgp->cmsg_level = IPPROTO_IPV6;
1409                 cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1410                 cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1411                 memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1412 #endif
1413         }
1414 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1415 #else /* ISC_NET_BSD44MSGHDR */
1416         msg->msg_accrights = NULL;
1417         msg->msg_accrightslen = 0;
1418 #endif /* ISC_NET_BSD44MSGHDR */
1419
1420         if (write_countp != NULL)
1421                 *write_countp = write_count;
1422 }
1423
1424 /*
1425  * Construct an iov array and attach it to the msghdr passed in.  This is
1426  * the RECV constructor, which will use the available region of the buffer
1427  * (if using a buffer list) or will use the internal region (if a single
1428  * buffer I/O is requested).
1429  *
1430  * Nothing can be NULL, and the done event must list at least one buffer
1431  * on the buffer linked list for this function to be meaningful.
1432  *
1433  * If read_countp != NULL, *read_countp will hold the number of bytes
1434  * this transaction can receive.
1435  */
1436 static void
1437 build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
1438                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1439 {
1440         unsigned int iovcount;
1441         isc_buffer_t *buffer;
1442         isc_region_t available;
1443         size_t read_count;
1444
1445         memset(msg, 0, sizeof(struct msghdr));
1446
1447         if (sock->type == isc_sockettype_udp) {
1448                 memset(&dev->address, 0, sizeof(dev->address));
1449 #ifdef BROKEN_RECVMSG
1450                 if (sock->pf == AF_INET) {
1451                         msg->msg_name = (void *)&dev->address.type.sin;
1452                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1453                 } else if (sock->pf == AF_INET6) {
1454                         msg->msg_name = (void *)&dev->address.type.sin6;
1455                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1456 #ifdef ISC_PLATFORM_HAVESYSUNH
1457                 } else if (sock->pf == AF_UNIX) {
1458                         msg->msg_name = (void *)&dev->address.type.sunix;
1459                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1460 #endif
1461                 } else {
1462                         msg->msg_name = (void *)&dev->address.type.sa;
1463                         msg->msg_namelen = sizeof(dev->address.type);
1464                 }
1465 #else
1466                 msg->msg_name = (void *)&dev->address.type.sa;
1467                 msg->msg_namelen = sizeof(dev->address.type);
1468 #endif
1469 #ifdef ISC_NET_RECVOVERFLOW
1470                 /* If needed, steal one iovec for overflow detection. */
1471                 maxiov--;
1472 #endif
1473         } else { /* TCP */
1474                 msg->msg_name = NULL;
1475                 msg->msg_namelen = 0;
1476                 dev->address = sock->peer_address;
1477         }
1478
1479         buffer = ISC_LIST_HEAD(dev->bufferlist);
1480         read_count = 0;
1481
1482         /*
1483          * Single buffer I/O?  Skip what we've done so far in this region.
1484          */
1485         if (buffer == NULL) {
1486                 read_count = dev->region.length - dev->n;
1487                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1488                 iov[0].iov_len = read_count;
1489                 iovcount = 1;
1490
1491                 goto config;
1492         }
1493
1494         /*
1495          * Multibuffer I/O.
1496          * Skip empty buffers.
1497          */
1498         while (buffer != NULL) {
1499                 REQUIRE(ISC_BUFFER_VALID(buffer));
1500                 if (isc_buffer_availablelength(buffer) != 0)
1501                         break;
1502                 buffer = ISC_LIST_NEXT(buffer, link);
1503         }
1504
1505         iovcount = 0;
1506         while (buffer != NULL) {
1507                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1508
1509                 isc_buffer_availableregion(buffer, &available);
1510
1511                 if (available.length > 0) {
1512                         iov[iovcount].iov_base = (void *)(available.base);
1513                         iov[iovcount].iov_len = available.length;
1514                         read_count += available.length;
1515                         iovcount++;
1516                 }
1517                 buffer = ISC_LIST_NEXT(buffer, link);
1518         }
1519
1520  config:
1521
1522         /*
1523          * If needed, set up to receive that one extra byte.  Note that
1524          * we know there is at least one iov left, since we stole it
1525          * at the top of this function.
1526          */
1527 #ifdef ISC_NET_RECVOVERFLOW
1528         if (sock->type == isc_sockettype_udp) {
1529                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1530                 iov[iovcount].iov_len = 1;
1531                 iovcount++;
1532         }
1533 #endif
1534
1535         msg->msg_iov = iov;
1536         msg->msg_iovlen = iovcount;
1537
1538 #ifdef ISC_NET_BSD44MSGHDR
1539         msg->msg_control = NULL;
1540         msg->msg_controllen = 0;
1541         msg->msg_flags = 0;
1542 #if defined(USE_CMSG)
1543         if (sock->type == isc_sockettype_udp) {
1544                 msg->msg_control = sock->recvcmsgbuf;
1545                 msg->msg_controllen = sock->recvcmsgbuflen;
1546         }
1547 #endif /* USE_CMSG */
1548 #else /* ISC_NET_BSD44MSGHDR */
1549         msg->msg_accrights = NULL;
1550         msg->msg_accrightslen = 0;
1551 #endif /* ISC_NET_BSD44MSGHDR */
1552
1553         if (read_countp != NULL)
1554                 *read_countp = read_count;
1555 }
1556
1557 static void
1558 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
1559                 isc_socketevent_t *dev)
1560 {
1561         if (sock->type == isc_sockettype_udp) {
1562                 if (address != NULL)
1563                         dev->address = *address;
1564                 else
1565                         dev->address = sock->peer_address;
1566         } else if (sock->type == isc_sockettype_tcp) {
1567                 INSIST(address == NULL);
1568                 dev->address = sock->peer_address;
1569         }
1570 }
1571
1572 static void
1573 destroy_socketevent(isc_event_t *event) {
1574         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1575
1576         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1577
1578         (ev->destroy)(event);
1579 }
1580
1581 static isc_socketevent_t *
1582 allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
1583                      isc_taskaction_t action, const void *arg)
1584 {
1585         isc_socketevent_t *ev;
1586
1587         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1588                                                      sock, eventtype,
1589                                                      action, arg,
1590                                                      sizeof(*ev));
1591
1592         if (ev == NULL)
1593                 return (NULL);
1594
1595         ev->result = ISC_R_UNSET;
1596         ISC_LINK_INIT(ev, ev_link);
1597         ISC_LIST_INIT(ev->bufferlist);
1598         ev->region.base = NULL;
1599         ev->n = 0;
1600         ev->offset = 0;
1601         ev->attributes = 0;
1602         ev->destroy = ev->ev_destroy;
1603         ev->ev_destroy = destroy_socketevent;
1604
1605         return (ev);
1606 }
1607
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the name, iovec array and (where available) the
 * control buffer of 'msg' to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	/* %p requires a 'void *' argument, hence the explicit casts. */
	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		/* 'i' is unsigned, so it is printed with %u. */
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
#endif
1628
1629 #define DOIO_SUCCESS            0       /* I/O OK, event sent */
1630 #define DOIO_SOFT               1       /* I/O OK, soft error, no event sent */
1631 #define DOIO_HARD               2       /* I/O error, event sent */
1632 #define DOIO_EOF                3       /* EOF, no event sent */
1633
1634 static int
1635 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
1636         int cc;
1637         struct iovec iov[MAXSCATTERGATHER_RECV];
1638         size_t read_count;
1639         size_t actual_count;
1640         struct msghdr msghdr;
1641         isc_buffer_t *buffer;
1642         int recv_errno;
1643         char strbuf[ISC_STRERRORSIZE];
1644
1645         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1646
1647 #if defined(ISC_SOCKET_DEBUG)
1648         dump_msg(&msghdr);
1649 #endif
1650
1651         cc = recvmsg(sock->fd, &msghdr, 0);
1652         recv_errno = errno;
1653
1654 #if defined(ISC_SOCKET_DEBUG)
1655         dump_msg(&msghdr);
1656 #endif
1657
1658         if (cc < 0) {
1659                 if (SOFT_ERROR(recv_errno))
1660                         return (DOIO_SOFT);
1661
1662                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1663                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1664                         socket_log(sock, NULL, IOEVENT,
1665                                    isc_msgcat, ISC_MSGSET_SOCKET,
1666                                    ISC_MSG_DOIORECV,
1667                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1668                                    sock->fd, cc, recv_errno, strbuf);
1669                 }
1670
1671 #define SOFT_OR_HARD(_system, _isc) \
1672         if (recv_errno == _system) { \
1673                 if (sock->connected) { \
1674                         dev->result = _isc; \
1675                         inc_stats(sock->manager->stats, \
1676                                   sock->statsindex[STATID_RECVFAIL]); \
1677                         return (DOIO_HARD); \
1678                 } \
1679                 return (DOIO_SOFT); \
1680         }
1681 #define ALWAYS_HARD(_system, _isc) \
1682         if (recv_errno == _system) { \
1683                 dev->result = _isc; \
1684                 inc_stats(sock->manager->stats, \
1685                           sock->statsindex[STATID_RECVFAIL]); \
1686                 return (DOIO_HARD); \
1687         }
1688
1689                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1690                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1691                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1692                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1693                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1694                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1695                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1696                 /* Should never get this one but it was seen. */
1697 #ifdef ENOPROTOOPT
1698                 SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
1699 #endif
1700                 /*
1701                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1702                  * errors.
1703                  */
1704 #ifdef EPROTO
1705                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1706 #endif
1707                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1708
1709 #undef SOFT_OR_HARD
1710 #undef ALWAYS_HARD
1711
1712                 dev->result = isc__errno2result(recv_errno);
1713                 inc_stats(sock->manager->stats,
1714                           sock->statsindex[STATID_RECVFAIL]);
1715                 return (DOIO_HARD);
1716         }
1717
1718         /*
1719          * On TCP and UNIX sockets, zero length reads indicate EOF,
1720          * while on UDP sockets, zero length reads are perfectly valid,
1721          * although strange.
1722          */
1723         switch (sock->type) {
1724         case isc_sockettype_tcp:
1725         case isc_sockettype_unix:
1726                 if (cc == 0)
1727                         return (DOIO_EOF);
1728                 break;
1729         case isc_sockettype_udp:
1730                 break;
1731         case isc_sockettype_fdwatch:
1732         default:
1733                 INSIST(0);
1734         }
1735
1736         if (sock->type == isc_sockettype_udp) {
1737                 dev->address.length = msghdr.msg_namelen;
1738                 if (isc_sockaddr_getport(&dev->address) == 0) {
1739                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1740                                 socket_log(sock, &dev->address, IOEVENT,
1741                                            isc_msgcat, ISC_MSGSET_SOCKET,
1742                                            ISC_MSG_ZEROPORT,
1743                                            "dropping source port zero packet");
1744                         }
1745                         return (DOIO_SOFT);
1746                 }
1747                 /*
1748                  * Simulate a firewall blocking UDP responses bigger than
1749                  * 512 bytes.
1750                  */
1751                 if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
1752                         return (DOIO_SOFT);
1753         }
1754
1755         socket_log(sock, &dev->address, IOEVENT,
1756                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1757                    "packet received correctly");
1758
1759         /*
1760          * Overflow bit detection.  If we received MORE bytes than we should,
1761          * this indicates an overflow situation.  Set the flag in the
1762          * dev entry and adjust how much we read by one.
1763          */
1764 #ifdef ISC_NET_RECVOVERFLOW
1765         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1766                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1767                 cc--;
1768         }
1769 #endif
1770
1771         /*
1772          * If there are control messages attached, run through them and pull
1773          * out the interesting bits.
1774          */
1775         if (sock->type == isc_sockettype_udp)
1776                 process_cmsg(sock, &msghdr, dev);
1777
1778         /*
1779          * update the buffers (if any) and the i/o count
1780          */
1781         dev->n += cc;
1782         actual_count = cc;
1783         buffer = ISC_LIST_HEAD(dev->bufferlist);
1784         while (buffer != NULL && actual_count > 0U) {
1785                 REQUIRE(ISC_BUFFER_VALID(buffer));
1786                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1787                         actual_count -= isc_buffer_availablelength(buffer);
1788                         isc_buffer_add(buffer,
1789                                        isc_buffer_availablelength(buffer));
1790                 } else {
1791                         isc_buffer_add(buffer, actual_count);
1792                         actual_count = 0;
1793                         POST(actual_count);
1794                         break;
1795                 }
1796                 buffer = ISC_LIST_NEXT(buffer, link);
1797                 if (buffer == NULL) {
1798                         INSIST(actual_count == 0U);
1799                 }
1800         }
1801
1802         /*
1803          * If we read less than we expected, update counters,
1804          * and let the upper layer poke the descriptor.
1805          */
1806         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1807                 return (DOIO_SOFT);
1808
1809         /*
1810          * Full reads are posted, or partials if partials are ok.
1811          */
1812         dev->result = ISC_R_SUCCESS;
1813         return (DOIO_SUCCESS);
1814 }
1815
1816 /*
1817  * Returns:
1818  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1819  *                      ISC_R_SUCCESS.
1820  *
1821  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1822  *                      dev->result contains the appropriate error.
1823  *
1824  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1825  *                      event was sent.  The operation should be retried.
1826  *
1827  *      No other return values are possible.
1828  */
1829 static int
1830 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1831         int cc;
1832         struct iovec iov[MAXSCATTERGATHER_SEND];
1833         size_t write_count;
1834         struct msghdr msghdr;
1835         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1836         int attempts = 0;
1837         int send_errno;
1838         char strbuf[ISC_STRERRORSIZE];
1839
1840         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1841
1842  resend:
1843         cc = sendmsg(sock->fd, &msghdr, 0);
1844         send_errno = errno;
1845
1846         /*
1847          * Check for error or block condition.
1848          */
1849         if (cc < 0) {
1850                 if (send_errno == EINTR && ++attempts < NRETRIES)
1851                         goto resend;
1852
1853                 if (SOFT_ERROR(send_errno))
1854                         return (DOIO_SOFT);
1855
1856 #define SOFT_OR_HARD(_system, _isc) \
1857         if (send_errno == _system) { \
1858                 if (sock->connected) { \
1859                         dev->result = _isc; \
1860                         inc_stats(sock->manager->stats, \
1861                                   sock->statsindex[STATID_SENDFAIL]); \
1862                         return (DOIO_HARD); \
1863                 } \
1864                 return (DOIO_SOFT); \
1865         }
1866 #define ALWAYS_HARD(_system, _isc) \
1867         if (send_errno == _system) { \
1868                 dev->result = _isc; \
1869                 inc_stats(sock->manager->stats, \
1870                           sock->statsindex[STATID_SENDFAIL]); \
1871                 return (DOIO_HARD); \
1872         }
1873
1874                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1875                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1876                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1877                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1878                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1879 #ifdef EHOSTDOWN
1880                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1881 #endif
1882                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1883                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1884                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1885                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1886                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1887
1888 #undef SOFT_OR_HARD
1889 #undef ALWAYS_HARD
1890
1891                 /*
1892                  * The other error types depend on whether or not the
1893                  * socket is UDP or TCP.  If it is UDP, some errors
1894                  * that we expect to be fatal under TCP are merely
1895                  * annoying, and are really soft errors.
1896                  *
1897                  * However, these soft errors are still returned as
1898                  * a status.
1899                  */
1900                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1901                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1902                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1903                                  addrbuf, strbuf);
1904                 dev->result = isc__errno2result(send_errno);
1905                 inc_stats(sock->manager->stats,
1906                           sock->statsindex[STATID_SENDFAIL]);
1907                 return (DOIO_HARD);
1908         }
1909
1910         if (cc == 0) {
1911                 inc_stats(sock->manager->stats,
1912                           sock->statsindex[STATID_SENDFAIL]);
1913                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1914                                  "doio_send: send() %s 0",
1915                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1916                                                 ISC_MSG_RETURNED, "returned"));
1917         }
1918
1919         /*
1920          * If we write less than we expected, update counters, poke.
1921          */
1922         dev->n += cc;
1923         if ((size_t)cc != write_count)
1924                 return (DOIO_SOFT);
1925
1926         /*
1927          * Exactly what we wanted to write.  We're done with this
1928          * entry.  Post its completion event.
1929          */
1930         dev->result = ISC_R_SUCCESS;
1931         return (DOIO_SUCCESS);
1932 }
1933
1934 /*
1935  * Kill.
1936  *
1937  * Caller must ensure that the socket is not locked and no external
1938  * references exist.
1939  */
1940 static void
1941 closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1942         isc_sockettype_t type = sock->type;
1943         int lockid = FDLOCK_ID(fd);
1944
1945         /*
1946          * No one has this socket open, so the watcher doesn't have to be
1947          * poked, and the socket doesn't have to be locked.
1948          */
1949         LOCK(&manager->fdlock[lockid]);
1950         manager->fds[fd] = NULL;
1951         if (type == isc_sockettype_fdwatch)
1952                 manager->fdstate[fd] = CLOSED;
1953         else
1954                 manager->fdstate[fd] = CLOSE_PENDING;
1955         UNLOCK(&manager->fdlock[lockid]);
1956         if (type == isc_sockettype_fdwatch) {
1957                 /*
1958                  * The caller may close the socket once this function returns,
1959                  * and `fd' may be reassigned for a new socket.  So we do
1960                  * unwatch_fd() here, rather than defer it via select_poke().
1961                  * Note: this may complicate data protection among threads and
1962                  * may reduce performance due to additional locks.  One way to
1963                  * solve this would be to dup() the watched descriptor, but we
1964                  * take a simpler approach at this moment.
1965                  */
1966                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1967                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1968         } else
1969                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1970
1971         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1972
1973         /*
1974          * update manager->maxfd here (XXX: this should be implemented more
1975          * efficiently)
1976          */
1977 #ifdef USE_SELECT
1978         LOCK(&manager->lock);
1979         if (manager->maxfd == fd) {
1980                 int i;
1981
1982                 manager->maxfd = 0;
1983                 for (i = fd - 1; i >= 0; i--) {
1984                         lockid = FDLOCK_ID(i);
1985
1986                         LOCK(&manager->fdlock[lockid]);
1987                         if (manager->fdstate[i] == MANAGED) {
1988                                 manager->maxfd = i;
1989                                 UNLOCK(&manager->fdlock[lockid]);
1990                                 break;
1991                         }
1992                         UNLOCK(&manager->fdlock[lockid]);
1993                 }
1994 #ifdef ISC_PLATFORM_USETHREADS
1995                 if (manager->maxfd < manager->pipe_fds[0])
1996                         manager->maxfd = manager->pipe_fds[0];
1997 #endif
1998         }
1999         UNLOCK(&manager->lock);
2000 #endif  /* USE_SELECT */
2001 }
2002
2003 static void
2004 destroy(isc__socket_t **sockp) {
2005         int fd;
2006         isc__socket_t *sock = *sockp;
2007         isc__socketmgr_t *manager = sock->manager;
2008
2009         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2010                    ISC_MSG_DESTROYING, "destroying");
2011
2012         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2013         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2014         INSIST(ISC_LIST_EMPTY(sock->send_list));
2015         INSIST(sock->connect_ev == NULL);
2016         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
2017
2018         if (sock->fd >= 0) {
2019                 fd = sock->fd;
2020                 sock->fd = -1;
2021                 closesocket(manager, sock, fd);
2022         }
2023
2024         LOCK(&manager->lock);
2025
2026         ISC_LIST_UNLINK(manager->socklist, sock, link);
2027
2028 #ifdef USE_WATCHER_THREAD
2029         if (ISC_LIST_EMPTY(manager->socklist))
2030                 SIGNAL(&manager->shutdown_ok);
2031 #endif /* USE_WATCHER_THREAD */
2032
2033         /* can't unlock manager as its memory context is still used */
2034         free_socket(sockp);
2035
2036         UNLOCK(&manager->lock);
2037 }
2038
2039 static isc_result_t
2040 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
2041                 isc__socket_t **socketp)
2042 {
2043         isc__socket_t *sock;
2044         isc_result_t result;
2045         ISC_SOCKADDR_LEN_T cmsgbuflen;
2046
2047         sock = isc_mem_get(manager->mctx, sizeof(*sock));
2048
2049         if (sock == NULL)
2050                 return (ISC_R_NOMEMORY);
2051
2052         sock->common.magic = 0;
2053         sock->common.impmagic = 0;
2054         sock->references = 0;
2055
2056         sock->manager = manager;
2057         sock->type = type;
2058         sock->fd = -1;
2059         sock->statsindex = NULL;
2060
2061         ISC_LINK_INIT(sock, link);
2062
2063         sock->recvcmsgbuf = NULL;
2064         sock->sendcmsgbuf = NULL;
2065
2066         /*
2067          * Set up cmsg buffers.
2068          */
2069         cmsgbuflen = 0;
2070 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2071         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2072 #endif
2073 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
2074         cmsgbuflen += cmsg_space(sizeof(struct timeval));
2075 #endif
2076         sock->recvcmsgbuflen = cmsgbuflen;
2077         if (sock->recvcmsgbuflen != 0U) {
2078                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2079                 if (sock->recvcmsgbuf == NULL) {
2080                         result = ISC_R_NOMEMORY;
2081                         goto error;
2082                 }
2083         }
2084
2085         cmsgbuflen = 0;
2086 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2087         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2088 #if defined(IPV6_USE_MIN_MTU)
2089         /*
2090          * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
2091          * support.
2092          */
2093         cmsgbuflen += cmsg_space(sizeof(int));
2094 #endif
2095 #endif
2096         sock->sendcmsgbuflen = cmsgbuflen;
2097         if (sock->sendcmsgbuflen != 0U) {
2098                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2099                 if (sock->sendcmsgbuf == NULL) {
2100                         result = ISC_R_NOMEMORY;
2101                         goto error;
2102                 }
2103         }
2104
2105         memset(sock->name, 0, sizeof(sock->name));
2106         sock->tag = NULL;
2107
2108         /*
2109          * Set up list of readers and writers to be initially empty.
2110          */
2111         ISC_LIST_INIT(sock->recv_list);
2112         ISC_LIST_INIT(sock->send_list);
2113         ISC_LIST_INIT(sock->accept_list);
2114         sock->connect_ev = NULL;
2115         sock->pending_recv = 0;
2116         sock->pending_send = 0;
2117         sock->pending_accept = 0;
2118         sock->listener = 0;
2119         sock->connected = 0;
2120         sock->connecting = 0;
2121         sock->bound = 0;
2122
2123         /*
2124          * Initialize the lock.
2125          */
2126         result = isc_mutex_init(&sock->lock);
2127         if (result != ISC_R_SUCCESS) {
2128                 sock->common.magic = 0;
2129                 sock->common.impmagic = 0;
2130                 goto error;
2131         }
2132
2133         /*
2134          * Initialize readable and writable events.
2135          */
2136         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
2137                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
2138                        NULL, sock, sock, NULL, NULL);
2139         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
2140                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
2141                        NULL, sock, sock, NULL, NULL);
2142
2143         sock->common.magic = ISCAPI_SOCKET_MAGIC;
2144         sock->common.impmagic = SOCKET_MAGIC;
2145         *socketp = sock;
2146
2147         return (ISC_R_SUCCESS);
2148
2149  error:
2150         if (sock->recvcmsgbuf != NULL)
2151                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
2152                             sock->recvcmsgbuflen);
2153         if (sock->sendcmsgbuf != NULL)
2154                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
2155                             sock->sendcmsgbuflen);
2156         isc_mem_put(manager->mctx, sock, sizeof(*sock));
2157
2158         return (result);
2159 }
2160
2161 /*
2162  * This event requires that the various lists be empty, that the reference
2163  * count be 1, and that the magic number is valid.  The other socket bits,
2164  * like the lock, must be initialized as well.  The fd associated must be
2165  * marked as closed, by setting it to -1 on close, or this routine will
2166  * also close the socket.
2167  */
2168 static void
2169 free_socket(isc__socket_t **socketp) {
2170         isc__socket_t *sock = *socketp;
2171
2172         INSIST(sock->references == 0);
2173         INSIST(VALID_SOCKET(sock));
2174         INSIST(!sock->connecting);
2175         INSIST(!sock->pending_recv);
2176         INSIST(!sock->pending_send);
2177         INSIST(!sock->pending_accept);
2178         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2179         INSIST(ISC_LIST_EMPTY(sock->send_list));
2180         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2181         INSIST(!ISC_LINK_LINKED(sock, link));
2182
2183         if (sock->recvcmsgbuf != NULL)
2184                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
2185                             sock->recvcmsgbuflen);
2186         if (sock->sendcmsgbuf != NULL)
2187                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
2188                             sock->sendcmsgbuflen);
2189
2190         sock->common.magic = 0;
2191         sock->common.impmagic = 0;
2192
2193         DESTROYLOCK(&sock->lock);
2194
2195         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
2196
2197         *socketp = NULL;
2198 }
2199
#ifdef SO_BSDCOMPAT
/*
 * Working out which kernel version we are running on at run time, just
 * so that the kernel does not warn about a deprecated socket option,
 * really should not be necessary.  Such warnings should *never* be on by
 * default in production kernels.
 *
 * This cannot be decided at build time because executables are moved
 * between machines and hence kernels.
 *
 * SO_BSDCOMPAT cannot simply be left unset because some kernels require
 * it.
 */

static isc_once_t	bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, clear 'bsdcompat' when uname() reports a kernel release of
 * 2.4 or later.  On other platforms this does nothing.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname uts;
	char *rest;
	long int rel_major;
	long int rel_minor;

	uname(&uts);	/* Can only fail if the buffer is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname(). */
	rel_major = strtol(uts.release, &rest, 10);
	if (*rest != '.')
		return;

	rel_minor = strtol(rest + 1, &rest, 10);
	if (rel_major > 2 || (rel_major == 2 && rel_minor >= 4))
		bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
2237
2238 static void
2239 use_min_mtu(isc__socket_t *sock) {
2240 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
2241         UNUSED(sock);
2242 #endif
2243 #ifdef IPV6_USE_MIN_MTU
2244         /* use minimum MTU */
2245         if (sock->pf == AF_INET6) {
2246                 int on = 1;
2247                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2248                                 (void *)&on, sizeof(on));
2249         }
2250 #endif
2251 #if defined(IPV6_MTU)
2252         /*
2253          * Use minimum MTU on IPv6 sockets.
2254          */
2255         if (sock->pf == AF_INET6) {
2256                 int mtu = 1280;
2257                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
2258                                  &mtu, sizeof(mtu));
2259         }
2260 #endif
2261 }
2262
2263 static isc_result_t
2264 opensocket(isc__socketmgr_t *manager, isc__socket_t *sock) {
2265         isc_result_t result;
2266         char strbuf[ISC_STRERRORSIZE];
2267         const char *err = "socket";
2268         int tries = 0;
2269 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT) || defined(SO_NOSIGPIPE)
2270         int on = 1;
2271 #endif
2272 #if defined(SO_RCVBUF)
2273         ISC_SOCKADDR_LEN_T optlen;
2274         int size;
2275 #endif
2276
2277  again:
2278         switch (sock->type) {
2279         case isc_sockettype_udp:
2280                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2281                 break;
2282         case isc_sockettype_tcp:
2283                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2284                 break;
2285         case isc_sockettype_unix:
2286                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2287                 break;
2288         case isc_sockettype_fdwatch:
2289                 /*
2290                  * We should not be called for isc_sockettype_fdwatch sockets.
2291                  */
2292                 INSIST(0);
2293                 break;
2294         }
2295         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2296                 goto again;
2297
2298 #ifdef F_DUPFD
2299         /*
2300          * Leave a space for stdio and TCP to work in.
2301          */
2302         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2303             sock->fd >= 0 && sock->fd < manager->reserved) {
2304                 int new, tmp;
2305                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2306                 tmp = errno;
2307                 (void)close(sock->fd);
2308                 errno = tmp;
2309                 sock->fd = new;
2310                 err = "isc_socket_create: fcntl/reserved";
2311         } else if (sock->fd >= 0 && sock->fd < 20) {
2312                 int new, tmp;
2313                 new = fcntl(sock->fd, F_DUPFD, 20);
2314                 tmp = errno;
2315                 (void)close(sock->fd);
2316                 errno = tmp;
2317                 sock->fd = new;
2318                 err = "isc_socket_create: fcntl";
2319         }
2320 #endif
2321
2322         if (sock->fd >= (int)manager->maxsocks) {
2323                 (void)close(sock->fd);
2324                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2325                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2326                                isc_msgcat, ISC_MSGSET_SOCKET,
2327                                ISC_MSG_TOOMANYFDS,
2328                                "socket: file descriptor exceeds limit (%d/%u)",
2329                                sock->fd, manager->maxsocks);
2330                 return (ISC_R_NORESOURCES);
2331         }
2332
2333         if (sock->fd < 0) {
2334                 switch (errno) {
2335                 case EMFILE:
2336                 case ENFILE:
2337                         isc__strerror(errno, strbuf, sizeof(strbuf));
2338                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2339                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2340                                        isc_msgcat, ISC_MSGSET_SOCKET,
2341                                        ISC_MSG_TOOMANYFDS,
2342                                        "%s: %s", err, strbuf);
2343                         /* fallthrough */
2344                 case ENOBUFS:
2345                         return (ISC_R_NORESOURCES);
2346
2347                 case EPROTONOSUPPORT:
2348                 case EPFNOSUPPORT:
2349                 case EAFNOSUPPORT:
2350                 /*
2351                  * Linux 2.2 (and maybe others) return EINVAL instead of
2352                  * EAFNOSUPPORT.
2353                  */
2354                 case EINVAL:
2355                         return (ISC_R_FAMILYNOSUPPORT);
2356
2357                 default:
2358                         isc__strerror(errno, strbuf, sizeof(strbuf));
2359                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2360                                          "%s() %s: %s", err,
2361                                          isc_msgcat_get(isc_msgcat,
2362                                                         ISC_MSGSET_GENERAL,
2363                                                         ISC_MSG_FAILED,
2364                                                         "failed"),
2365                                          strbuf);
2366                         return (ISC_R_UNEXPECTED);
2367                 }
2368         }
2369
2370         result = make_nonblock(sock->fd);
2371         if (result != ISC_R_SUCCESS) {
2372                 (void)close(sock->fd);
2373                 return (result);
2374         }
2375
2376 #ifdef SO_BSDCOMPAT
2377         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2378                                   clear_bsdcompat) == ISC_R_SUCCESS);
2379         if (sock->type != isc_sockettype_unix && bsdcompat &&
2380             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2381                        (void *)&on, sizeof(on)) < 0) {
2382                 isc__strerror(errno, strbuf, sizeof(strbuf));
2383                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2384                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2385                                  sock->fd,
2386                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2387                                                 ISC_MSG_FAILED, "failed"),
2388                                  strbuf);
2389                 /* Press on... */
2390         }
2391 #endif
2392
2393 #ifdef SO_NOSIGPIPE
2394         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2395                        (void *)&on, sizeof(on)) < 0) {
2396                 isc__strerror(errno, strbuf, sizeof(strbuf));
2397                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2398                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2399                                  sock->fd,
2400                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2401                                                 ISC_MSG_FAILED, "failed"),
2402                                  strbuf);
2403                 /* Press on... */
2404         }
2405 #endif
2406
2407         /*
2408          * Use minimum mtu if possible.
2409          */
2410         use_min_mtu(sock);
2411
2412 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2413         if (sock->type == isc_sockettype_udp) {
2414
2415 #if defined(USE_CMSG)
2416 #if defined(SO_TIMESTAMP)
2417                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2418                                (void *)&on, sizeof(on)) < 0
2419                     && errno != ENOPROTOOPT) {
2420                         isc__strerror(errno, strbuf, sizeof(strbuf));
2421                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2422                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2423                                          sock->fd,
2424                                          isc_msgcat_get(isc_msgcat,
2425                                                         ISC_MSGSET_GENERAL,
2426                                                         ISC_MSG_FAILED,
2427                                                         "failed"),
2428                                          strbuf);
2429                         /* Press on... */
2430                 }
2431 #endif /* SO_TIMESTAMP */
2432
2433 #if defined(ISC_PLATFORM_HAVEIPV6)
2434                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2435                         /*
2436                          * Warn explicitly because this anomaly can be hidden
2437                          * in usual operation (and unexpectedly appear later).
2438                          */
2439                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2440                                          "No buffer available to receive "
2441                                          "IPv6 destination");
2442                 }
2443 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2444 #ifdef IPV6_RECVPKTINFO
2445                 /* RFC 3542 */
2446                 if ((sock->pf == AF_INET6)
2447                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2448                                    (void *)&on, sizeof(on)) < 0)) {
2449                         isc__strerror(errno, strbuf, sizeof(strbuf));
2450                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2451                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2452                                          "%s: %s", sock->fd,
2453                                          isc_msgcat_get(isc_msgcat,
2454                                                         ISC_MSGSET_GENERAL,
2455                                                         ISC_MSG_FAILED,
2456                                                         "failed"),
2457                                          strbuf);
2458                 }
2459 #else
2460                 /* RFC 2292 */
2461                 if ((sock->pf == AF_INET6)
2462                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2463                                    (void *)&on, sizeof(on)) < 0)) {
2464                         isc__strerror(errno, strbuf, sizeof(strbuf));
2465                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2466                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2467                                          sock->fd,
2468                                          isc_msgcat_get(isc_msgcat,
2469                                                         ISC_MSGSET_GENERAL,
2470                                                         ISC_MSG_FAILED,
2471                                                         "failed"),
2472                                          strbuf);
2473                 }
2474 #endif /* IPV6_RECVPKTINFO */
2475 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2476 #if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2477                 /*
2478                  * Turn off Path MTU discovery on IPv6/UDP sockets.
2479                  */
2480                 if (sock->pf == AF_INET6) {
2481                         int action = IPV6_PMTUDISC_DONT;
2482                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2483                                          IPV6_MTU_DISCOVER, &action,
2484                                          sizeof(action));
2485                 }
2486 #endif
2487 #endif /* ISC_PLATFORM_HAVEIPV6 */
2488 #endif /* defined(USE_CMSG) */
2489
2490 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2491                 /*
2492                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2493                  */
2494                 if (sock->pf == AF_INET) {
2495                         int action = IP_PMTUDISC_DONT;
2496                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2497                                          &action, sizeof(action));
2498                 }
2499 #endif
2500 #if defined(IP_DONTFRAG)
2501                 /*
2502                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2503                  */
2504                 if (sock->pf == AF_INET) {
2505                         int off = 0;
2506                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2507                                          &off, sizeof(off));
2508                 }
2509 #endif
2510
2511 #if defined(SO_RCVBUF)
2512                 optlen = sizeof(size);
2513                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2514                                (void *)&size, &optlen) >= 0 &&
2515                      size < RCVBUFSIZE) {
2516                         size = RCVBUFSIZE;
2517                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2518                                        (void *)&size, sizeof(size)) == -1) {
2519                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2520                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2521                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2522                                         sock->fd, size,
2523                                         isc_msgcat_get(isc_msgcat,
2524                                                        ISC_MSGSET_GENERAL,
2525                                                        ISC_MSG_FAILED,
2526                                                        "failed"),
2527                                         strbuf);
2528                         }
2529                 }
2530 #endif
2531         }
2532 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2533
2534         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2535
2536         return (ISC_R_SUCCESS);
2537 }
2538
2539 /*%
2540  * Create a new 'type' socket managed by 'manager'.  Events
2541  * will be posted to 'task' and when dispatched 'action' will be
2542  * called with 'arg' as the arg value.  The new socket is returned
2543  * in 'socketp'.
2544  */
2545 ISC_SOCKETFUNC_SCOPE isc_result_t
2546 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2547                    isc_socket_t **socketp)
2548 {
2549         isc__socket_t *sock = NULL;
2550         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2551         isc_result_t result;
2552         int lockid;
2553
2554         REQUIRE(VALID_MANAGER(manager));
2555         REQUIRE(socketp != NULL && *socketp == NULL);
2556         REQUIRE(type != isc_sockettype_fdwatch);
2557
2558         result = allocate_socket(manager, type, &sock);
2559         if (result != ISC_R_SUCCESS)
2560                 return (result);
2561
2562         switch (sock->type) {
2563         case isc_sockettype_udp:
2564                 sock->statsindex =
2565                         (pf == AF_INET) ? udp4statsindex : udp6statsindex;
2566                 break;
2567         case isc_sockettype_tcp:
2568                 sock->statsindex =
2569                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2570                 break;
2571         case isc_sockettype_unix:
2572                 sock->statsindex = unixstatsindex;
2573                 break;
2574         default:
2575                 INSIST(0);
2576         }
2577
2578         sock->pf = pf;
2579         result = opensocket(manager, sock);
2580         if (result != ISC_R_SUCCESS) {
2581                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2582                 free_socket(&sock);
2583                 return (result);
2584         }
2585
2586         sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2587         sock->references = 1;
2588         *socketp = (isc_socket_t *)sock;
2589
2590         /*
2591          * Note we don't have to lock the socket like we normally would because
2592          * there are no external references to it yet.
2593          */
2594
2595         lockid = FDLOCK_ID(sock->fd);
2596         LOCK(&manager->fdlock[lockid]);
2597         manager->fds[sock->fd] = sock;
2598         manager->fdstate[sock->fd] = MANAGED;
2599 #ifdef USE_DEVPOLL
2600         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2601                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2602 #endif
2603         UNLOCK(&manager->fdlock[lockid]);
2604
2605         LOCK(&manager->lock);
2606         ISC_LIST_APPEND(manager->socklist, sock, link);
2607 #ifdef USE_SELECT
2608         if (manager->maxfd < sock->fd)
2609                 manager->maxfd = sock->fd;
2610 #endif
2611         UNLOCK(&manager->lock);
2612
2613         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2614                    ISC_MSG_CREATED, "created");
2615
2616         return (ISC_R_SUCCESS);
2617 }
2618
#ifdef BIND9
/*
 * Open a descriptor for an existing socket object whose descriptor is
 * currently unset (fd == -1).  The caller must hold the only reference
 * and the socket must not be an fdwatch socket.
 *
 * On success the new descriptor is entered in the manager's fd tables;
 * on failure the socket's fd is left at -1 and the result of
 * opensocket() is returned.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
        isc__socket_t *sock = (isc__socket_t *)sock0;
        isc_result_t result;
        int lockid;

        REQUIRE(VALID_SOCKET(sock));

        LOCK(&sock->lock);
        REQUIRE(sock->references == 1);
        REQUIRE(sock->type != isc_sockettype_fdwatch);
        UNLOCK(&sock->lock);

        /*
         * With a single reference nobody else can be using this socket,
         * so the lock is not needed from here on.
         */
        REQUIRE(sock->fd == -1);

        result = opensocket(sock->manager, sock);
        if (result != ISC_R_SUCCESS) {
                sock->fd = -1;
                return (result);
        }

        /* Register the freshly opened descriptor with the manager. */
        lockid = FDLOCK_ID(sock->fd);

        LOCK(&sock->manager->fdlock[lockid]);
        sock->manager->fds[sock->fd] = sock;
        sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
        INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
               sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
        UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
        LOCK(&sock->manager->lock);
        if (sock->manager->maxfd < sock->fd)
                sock->manager->maxfd = sock->fd;
        UNLOCK(&sock->manager->lock);
#endif

        return (result);
}
#endif  /* BIND9 */
2664
2665 /*
2666  * Create a new 'type' socket managed by 'manager'.  Events
2667  * will be posted to 'task' and when dispatched 'action' will be
2668  * called with 'arg' as the arg value.  The new socket is returned
2669  * in 'socketp'.
2670  */
2671 ISC_SOCKETFUNC_SCOPE isc_result_t
2672 isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
2673                           isc_sockfdwatch_t callback, void *cbarg,
2674                           isc_task_t *task, isc_socket_t **socketp)
2675 {
2676         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2677         isc__socket_t *sock = NULL;
2678         isc_result_t result;
2679         int lockid;
2680
2681         REQUIRE(VALID_MANAGER(manager));
2682         REQUIRE(socketp != NULL && *socketp == NULL);
2683
2684         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2685         if (result != ISC_R_SUCCESS)
2686                 return (result);
2687
2688         sock->fd = fd;
2689         sock->fdwatcharg = cbarg;
2690         sock->fdwatchcb = callback;
2691         sock->fdwatchflags = flags;
2692         sock->fdwatchtask = task;
2693         sock->statsindex = fdwatchstatsindex;
2694
2695         sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2696         sock->references = 1;
2697         *socketp = (isc_socket_t *)sock;
2698
2699         /*
2700          * Note we don't have to lock the socket like we normally would because
2701          * there are no external references to it yet.
2702          */
2703
2704         lockid = FDLOCK_ID(sock->fd);
2705         LOCK(&manager->fdlock[lockid]);
2706         manager->fds[sock->fd] = sock;
2707         manager->fdstate[sock->fd] = MANAGED;
2708         UNLOCK(&manager->fdlock[lockid]);
2709
2710         LOCK(&manager->lock);
2711         ISC_LIST_APPEND(manager->socklist, sock, link);
2712 #ifdef USE_SELECT
2713         if (manager->maxfd < sock->fd)
2714                 manager->maxfd = sock->fd;
2715 #endif
2716         UNLOCK(&manager->lock);
2717
2718         if (flags & ISC_SOCKFDWATCH_READ)
2719                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2720         if (flags & ISC_SOCKFDWATCH_WRITE)
2721                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2722
2723         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2724                    ISC_MSG_CREATED, "fdwatch-created");
2725
2726         return (ISC_R_SUCCESS);
2727 }
2728
2729 /*
2730  * Indicate to the manager that it should watch the socket again.
2731  * This can be used to restart watching if the previous event handler
2732  * didn't indicate there was more data to be processed.  Primarily
2733  * it is for writing but could be used for reading if desired
2734  */
2735
2736 ISC_SOCKETFUNC_SCOPE isc_result_t
2737 isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags)
2738 {
2739         isc__socket_t *sock = (isc__socket_t *)sock0;
2740
2741         REQUIRE(VALID_SOCKET(sock));
2742
2743         /*
2744          * We check both flags first to allow us to get the lock
2745          * once but only if we need it.
2746          */
2747
2748         if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
2749                 LOCK(&sock->lock);
2750                 if (((flags & ISC_SOCKFDWATCH_READ) != 0) &&
2751                     !sock->pending_recv)
2752                         select_poke(sock->manager, sock->fd,
2753                                     SELECT_POKE_READ);
2754                 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) &&
2755                     !sock->pending_send)
2756                         select_poke(sock->manager, sock->fd,
2757                                     SELECT_POKE_WRITE);
2758                 UNLOCK(&sock->lock);
2759         }
2760
2761         socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2762                    ISC_MSG_POKED, "fdwatch-poked flags: %d", flags);
2763
2764         return (ISC_R_SUCCESS);
2765 }
2766
2767 /*
2768  * Attach to a socket.  Caller must explicitly detach when it is done.
2769  */
2770 ISC_SOCKETFUNC_SCOPE void
2771 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2772         isc__socket_t *sock = (isc__socket_t *)sock0;
2773
2774         REQUIRE(VALID_SOCKET(sock));
2775         REQUIRE(socketp != NULL && *socketp == NULL);
2776
2777         LOCK(&sock->lock);
2778         sock->references++;
2779         UNLOCK(&sock->lock);
2780
2781         *socketp = (isc_socket_t *)sock;
2782 }
2783
2784 /*
2785  * Dereference a socket.  If this is the last reference to it, clean things
2786  * up by destroying the socket.
2787  */
2788 ISC_SOCKETFUNC_SCOPE void
2789 isc__socket_detach(isc_socket_t **socketp) {
2790         isc__socket_t *sock;
2791         isc_boolean_t kill_socket = ISC_FALSE;
2792
2793         REQUIRE(socketp != NULL);
2794         sock = (isc__socket_t *)*socketp;
2795         REQUIRE(VALID_SOCKET(sock));
2796
2797         LOCK(&sock->lock);
2798         REQUIRE(sock->references > 0);
2799         sock->references--;
2800         if (sock->references == 0)
2801                 kill_socket = ISC_TRUE;
2802         UNLOCK(&sock->lock);
2803
2804         if (kill_socket)
2805                 destroy(&sock);
2806
2807         *socketp = NULL;
2808 }
2809
#ifdef BIND9
/*
 * Close the descriptor of 'sock0' while keeping the socket object itself.
 *
 * The caller must hold the only reference, the socket must not be an
 * fdwatch socket, and it must be completely idle: nothing pending,
 * nothing queued, no connect in progress.  The per-connection state
 * (name, tag, listener/connected/connecting/bound flags, peer address)
 * is reset and fd is set to -1 before the old descriptor is handed to
 * closesocket().  Always returns ISC_R_SUCCESS.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_close(isc_socket_t *sock0) {
        isc__socket_t *sock = (isc__socket_t *)sock0;
        isc__socketmgr_t *manager;
        int oldfd;

        REQUIRE(VALID_SOCKET(sock));

        LOCK(&sock->lock);

        REQUIRE(sock->references == 1);
        REQUIRE(sock->type != isc_sockettype_fdwatch);
        REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

        /* The socket has to be idle before its descriptor can go away. */
        INSIST(!sock->connecting);
        INSIST(!sock->pending_recv);
        INSIST(!sock->pending_send);
        INSIST(!sock->pending_accept);
        INSIST(ISC_LIST_EMPTY(sock->recv_list));
        INSIST(ISC_LIST_EMPTY(sock->send_list));
        INSIST(ISC_LIST_EMPTY(sock->accept_list));
        INSIST(sock->connect_ev == NULL);

        /* Save what closesocket() needs before the state is wiped. */
        oldfd = sock->fd;
        manager = sock->manager;

        /* Return the socket to its unopened state. */
        sock->fd = -1;
        sock->tag = NULL;
        memset(sock->name, 0, sizeof(sock->name));
        sock->bound = 0;
        sock->listener = 0;
        sock->connecting = 0;
        sock->connected = 0;
        isc_sockaddr_any(&sock->peer_address);

        UNLOCK(&sock->lock);

        closesocket(manager, sock, oldfd);

        return (ISC_R_SUCCESS);
}
#endif  /* BIND9 */
2852
2853 /*
2854  * I/O is possible on a given socket.  Schedule an event to this task that
2855  * will call an internal function to do the I/O.  This will charge the
2856  * task with the I/O operation and let our select loop handler get back
2857  * to doing something real as fast as possible.
2858  *
2859  * The socket and manager must be locked before calling this function.
2860  */
2861 static void
2862 dispatch_recv(isc__socket_t *sock) {
2863         intev_t *iev;
2864         isc_socketevent_t *ev;
2865         isc_task_t *sender;
2866
2867         INSIST(!sock->pending_recv);
2868
2869         if (sock->type != isc_sockettype_fdwatch) {
2870                 ev = ISC_LIST_HEAD(sock->recv_list);
2871                 if (ev == NULL)
2872                         return;
2873                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2874                            "dispatch_recv:  event %p -> task %p",
2875                            ev, ev->ev_sender);
2876                 sender = ev->ev_sender;
2877         } else {
2878                 sender = sock->fdwatchtask;
2879         }
2880
2881         sock->pending_recv = 1;
2882         iev = &sock->readable_ev;
2883
2884         sock->references++;
2885         iev->ev_sender = sock;
2886         if (sock->type == isc_sockettype_fdwatch)
2887                 iev->ev_action = internal_fdwatch_read;
2888         else
2889                 iev->ev_action = internal_recv;
2890         iev->ev_arg = sock;
2891
2892         isc_task_send(sender, (isc_event_t **)&iev);
2893 }
2894
2895 static void
2896 dispatch_send(isc__socket_t *sock) {
2897         intev_t *iev;
2898         isc_socketevent_t *ev;
2899         isc_task_t *sender;
2900
2901         INSIST(!sock->pending_send);
2902
2903         if (sock->type != isc_sockettype_fdwatch) {
2904                 ev = ISC_LIST_HEAD(sock->send_list);
2905                 if (ev == NULL)
2906                         return;
2907                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2908                            "dispatch_send:  event %p -> task %p",
2909                            ev, ev->ev_sender);
2910                 sender = ev->ev_sender;
2911         } else {
2912                 sender = sock->fdwatchtask;
2913         }
2914
2915         sock->pending_send = 1;
2916         iev = &sock->writable_ev;
2917
2918         sock->references++;
2919         iev->ev_sender = sock;
2920         if (sock->type == isc_sockettype_fdwatch)
2921                 iev->ev_action = internal_fdwatch_write;
2922         else
2923                 iev->ev_action = internal_send;
2924         iev->ev_arg = sock;
2925
2926         isc_task_send(sender, (isc_event_t **)&iev);
2927 }
2928
2929 /*
2930  * Dispatch an internal accept event.
2931  */
2932 static void
2933 dispatch_accept(isc__socket_t *sock) {
2934         intev_t *iev;
2935         isc_socket_newconnev_t *ev;
2936
2937         INSIST(!sock->pending_accept);
2938
2939         /*
2940          * Are there any done events left, or were they all canceled
2941          * before the manager got the socket lock?
2942          */
2943         ev = ISC_LIST_HEAD(sock->accept_list);
2944         if (ev == NULL)
2945                 return;
2946
2947         sock->pending_accept = 1;
2948         iev = &sock->readable_ev;
2949
2950         sock->references++;  /* keep socket around for this internal event */
2951         iev->ev_sender = sock;
2952         iev->ev_action = internal_accept;
2953         iev->ev_arg = sock;
2954
2955         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2956 }
2957
2958 static void
2959 dispatch_connect(isc__socket_t *sock) {
2960         intev_t *iev;
2961         isc_socket_connev_t *ev;
2962
2963         iev = &sock->writable_ev;
2964
2965         ev = sock->connect_ev;
2966         INSIST(ev != NULL); /* XXX */
2967
2968         INSIST(sock->connecting);
2969
2970         sock->references++;  /* keep socket around for this internal event */
2971         iev->ev_sender = sock;
2972         iev->ev_action = internal_connect;
2973         iev->ev_arg = sock;
2974
2975         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2976 }
2977
2978 /*
2979  * Dequeue an item off the given socket's read queue, set the result code
2980  * in the done event to the one provided, and send it to the task it was
2981  * destined for.
2982  *
2983  * If the event to be sent is on a list, remove it before sending.  If
2984  * asked to, send and detach from the socket as well.
2985  *
2986  * Caller must have the socket locked if the event is attached to the socket.
2987  */
2988 static void
2989 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
2990         isc_task_t *task;
2991
2992         task = (*dev)->ev_sender;
2993
2994         (*dev)->ev_sender = sock;
2995
2996         if (ISC_LINK_LINKED(*dev, ev_link))
2997                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2998
2999         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3000             == ISC_SOCKEVENTATTR_ATTACHED)
3001                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
3002         else
3003                 isc_task_send(task, (isc_event_t **)dev);
3004 }
3005
3006 /*
3007  * See comments for send_recvdone_event() above.
3008  *
3009  * Caller must have the socket locked if the event is attached to the socket.
3010  */
3011 static void
3012 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3013         isc_task_t *task;
3014
3015         INSIST(dev != NULL && *dev != NULL);
3016
3017         task = (*dev)->ev_sender;
3018         (*dev)->ev_sender = sock;
3019
3020         if (ISC_LINK_LINKED(*dev, ev_link))
3021                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
3022
3023         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3024             == ISC_SOCKEVENTATTR_ATTACHED)
3025                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
3026         else
3027                 isc_task_send(task, (isc_event_t **)dev);
3028 }
3029
3030 /*
3031  * Call accept() on a socket, to get the new file descriptor.  The listen
3032  * socket is used as a prototype to create a new isc_socket_t.  The new
3033  * socket has one outstanding reference.  The task receiving the event
3034  * will be detached from just after the event is delivered.
3035  *
3036  * On entry to this function, the event delivered is the internal
3037  * readable event, and the first item on the accept_list should be
3038  * the done event we want to send.  If the list is empty, this is a no-op,
3039  * so just unlock and return.
      *
      * Possible outcomes once a done event has been found:
      *  - Soft error (see the 'soft_error' label): the done event stays on
      *    accept_list, the watcher is poked to watch the socket again and
      *    the ACCEPTFAIL counter is incremented.
      *  - Unexpected accept() error or make_nonblock() failure: the done
      *    event is unlinked and sent carrying the failure result, and the
      *    socket object attached to it (NEWCONNSOCK(dev)) is freed.
      *  - Success: the new descriptor is stored in NEWCONNSOCK(dev),
      *    entered in the manager's fd tables and socket list, and the done
      *    event is sent with ISC_R_SUCCESS.
3040  */
3041 static void
3042 internal_accept(isc_task_t *me, isc_event_t *ev) {
3043         isc__socket_t *sock;
3044         isc__socketmgr_t *manager;
3045         isc_socket_newconnev_t *dev;
3046         isc_task_t *task;
3047         ISC_SOCKADDR_LEN_T addrlen;
3048         int fd;
3049         isc_result_t result = ISC_R_SUCCESS;
3050         char strbuf[ISC_STRERRORSIZE];
3051         const char *err = "accept";
3052
3053         UNUSED(me);
3054
3055         sock = ev->ev_sender;
3056         INSIST(VALID_SOCKET(sock));
3057
3058         LOCK(&sock->lock);
3059         socket_log(sock, NULL, TRACE,
3060                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
3061                    "internal_accept called, locked socket");
3062
3063         manager = sock->manager;
3064         INSIST(VALID_MANAGER(manager));
3065
3066         INSIST(sock->listener);
3067         INSIST(sock->pending_accept == 1);
3068         sock->pending_accept = 0;
3069
3070         INSIST(sock->references > 0);
3071         sock->references--;  /* the internal event is done with this socket */
             /* That was the last reference: destroy the socket, do not accept. */
3072         if (sock->references == 0) {
3073                 UNLOCK(&sock->lock);
3074                 destroy(&sock);
3075                 return;
3076         }
3077
3078         /*
3079          * Get the first item off the accept list.
3080          * If it is empty, unlock the socket and return.
3081          */
3082         dev = ISC_LIST_HEAD(sock->accept_list);
3083         if (dev == NULL) {
3084                 UNLOCK(&sock->lock);
3085                 return;
3086         }
3087
3088         /*
3089          * Try to accept the new connection.  If the accept fails with
3090          * EAGAIN or EINTR, simply poke the watcher to watch this socket
3091          * again.  Also ignore ECONNRESET, which has been reported to
3092          * be spuriously returned on Linux 2.2.19 although it is not
3093          * a documented error for accept().  ECONNABORTED has been
3094          * reported for Solaris 8.  The rest are thrown in not because
3095          * we have seen them but because they are ignored by other
3096          * daemons such as BIND 8 and Apache.
3097          */
3098
             /* The peer address is written straight into the new socket object. */
3099         addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
3100         memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
3101         fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
3102                     (void *)&addrlen);
3103
3104 #ifdef F_DUPFD
3105         /*
3106          * Leave a space for stdio to work in.
              *
              * Descriptors below 20 are duplicated to 20 or above and the
              * original is closed.  errno from fcntl() is saved across the
              * close() so that a failed fcntl() is what the error handling
              * below sees; 'err' is updated so messages name the fcntl step.
3107          */
3108         if (fd >= 0 && fd < 20) {
3109                 int new, tmp;
3110                 new = fcntl(fd, F_DUPFD, 20);
3111                 tmp = errno;
3112                 (void)close(fd);
3113                 errno = tmp;
3114                 fd = new;
3115                 err = "accept/fcntl";
3116         }
3117 #endif
3118
3119         if (fd < 0) {
3120                 if (SOFT_ERROR(errno))
3121                         goto soft_error;
3122                 switch (errno) {
3123                 case ENFILE:
3124                 case EMFILE:
                             /* Out of descriptors: log it, then retry later. */
3125                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3126                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3127                                        isc_msgcat, ISC_MSGSET_SOCKET,
3128                                        ISC_MSG_TOOMANYFDS,
3129                                        "%s: too many open file descriptors",
3130                                        err);
3131                         goto soft_error;
3132
3133                 case ENOBUFS:
3134                 case ENOMEM:
3135                 case ECONNRESET:
3136                 case ECONNABORTED:
3137                 case EHOSTUNREACH:
3138                 case EHOSTDOWN:
3139                 case ENETUNREACH:
3140                 case ENETDOWN:
3141                 case ECONNREFUSED:
3142 #ifdef EPROTO
3143                 case EPROTO:
3144 #endif
3145 #ifdef ENONET
3146                 case ENONET:
3147 #endif
3148                         goto soft_error;
3149                 default:
3150                         break;
3151                 }
                     /* Any other errno is reported through the done event. */
3152                 isc__strerror(errno, strbuf, sizeof(strbuf));
3153                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3154                                  "internal_accept: %s() %s: %s", err,
3155                                  isc_msgcat_get(isc_msgcat,
3156                                                 ISC_MSGSET_GENERAL,
3157                                                 ISC_MSG_FAILED,
3158                                                 "failed"),
3159                                  strbuf);
3160                 fd = -1;
3161                 result = ISC_R_UNEXPECTED;
3162         } else {
                     /*
                      * accept() succeeded; sanity-check what it returned.  A
                      * descriptor failing any of these checks is closed and
                      * the accept is retried via soft_error.
                      */
3163                 if (addrlen == 0U) {
3164                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3165                                          "internal_accept(): "
3166                                          "accept() failed to return "
3167                                          "remote address");
3168
3169                         (void)close(fd);
3170                         goto soft_error;
3171                 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
3172                            sock->pf)
3173                 {
3174                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3175                                          "internal_accept(): "
3176                                          "accept() returned peer address "
3177                                          "family %u (expected %u)",
3178                                          NEWCONNSOCK(dev)->peer_address.
3179                                          type.sa.sa_family,
3180                                          sock->pf);
3181                         (void)close(fd);
3182                         goto soft_error;
3183                 } else if (fd >= (int)manager->maxsocks) {
3184                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3185                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3186                                        isc_msgcat, ISC_MSGSET_SOCKET,
3187                                        ISC_MSG_TOOMANYFDS,
3188                                        "accept: "
3189                                        "file descriptor exceeds limit (%d/%u)",
3190                                        fd, manager->maxsocks);
3191                         (void)close(fd);
3192                         goto soft_error;
3193                 }
3194         }
3195
3196         if (fd != -1) {
3197                 NEWCONNSOCK(dev)->peer_address.length = addrlen;
3198                 NEWCONNSOCK(dev)->pf = sock->pf;
3199         }
3200
3201         /*
3202          * Pull off the done event.
3203          */
3204         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
3205
3206         /*
3207          * Poke watcher if there are more pending accepts.
3208          */
3209         if (!ISC_LIST_EMPTY(sock->accept_list))
3210                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3211
3212         UNLOCK(&sock->lock);
3213
             /*
              * The listening socket's lock is no longer held from here on.
              * A descriptor that cannot be made non-blocking is closed and
              * handled like a failed accept, with make_nonblock()'s result.
              */
3214         if (fd != -1) {
3215                 result = make_nonblock(fd);
3216                 if (result != ISC_R_SUCCESS) {
3217                         (void)close(fd);
3218                         fd = -1;
3219                 }
3220         }
3221
3222         /*
3223          * -1 means the new socket didn't happen.
3224          */
3225         if (fd != -1) {
3226                 int lockid = FDLOCK_ID(fd);
3227
3228                 NEWCONNSOCK(dev)->fd = fd;
3229                 NEWCONNSOCK(dev)->bound = 1;
3230                 NEWCONNSOCK(dev)->connected = 1;
3231
3232                 /*
3233                  * Use minimum mtu if possible.
3234                  */
3235                 use_min_mtu(NEWCONNSOCK(dev));
3236
3237                 /*
3238                  * Save away the remote address
3239                  */
3240                 dev->address = NEWCONNSOCK(dev)->peer_address;
3241
                     /* Enter the new descriptor in the manager's fd tables. */
3242                 LOCK(&manager->fdlock[lockid]);
3243                 manager->fds[fd] = NEWCONNSOCK(dev);
3244                 manager->fdstate[fd] = MANAGED;
3245                 UNLOCK(&manager->fdlock[lockid]);
3246
3247                 LOCK(&manager->lock);
3248
3249 #ifdef USE_SELECT
3250                 if (manager->maxfd < fd)
3251                         manager->maxfd = fd;
3252 #endif
3253
3254                 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3255                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
3256                            "accepted connection, new socket %p",
3257                            dev->newsocket);
3258
3259                 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3260
3261                 UNLOCK(&manager->lock);
3262
3263                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3264         } else {
                     /*
                      * No connection: count the failure, drop the reference
                      * on the socket object attached to the done event and
                      * free it.
                      */
3265                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3266                 NEWCONNSOCK(dev)->references--;
3267                 free_socket((isc__socket_t **)&dev->newsocket);
3268         }
3269
3270         /*
3271          * Fill in the done event details and send it off.
3272          */
3273         dev->result = result;
3274         task = dev->ev_sender;
3275         dev->ev_sender = sock;
3276
3277         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3278         return;
3279
             /*
              * Soft error: the done event stays queued on accept_list; ask
              * the watcher to watch this socket for another accept attempt.
              * The socket lock is still held on entry to this label.
              */
3280  soft_error:
3281         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3282         UNLOCK(&sock->lock);
3283
3284         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3285         return;
3286 }
3287
3288 static void
3289 internal_recv(isc_task_t *me, isc_event_t *ev) {
3290         isc_socketevent_t *dev;
3291         isc__socket_t *sock;
3292
3293         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3294
3295         sock = ev->ev_sender;
3296         INSIST(VALID_SOCKET(sock));
3297
3298         LOCK(&sock->lock);
3299         socket_log(sock, NULL, IOEVENT,
3300                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3301                    "internal_recv: task %p got event %p", me, ev);
3302
3303         INSIST(sock->pending_recv == 1);
3304         sock->pending_recv = 0;
3305
3306         INSIST(sock->references > 0);
3307         sock->references--;  /* the internal event is done with this socket */
3308         if (sock->references == 0) {
3309                 UNLOCK(&sock->lock);
3310                 destroy(&sock);
3311                 return;
3312         }
3313
3314         /*
3315          * Try to do as much I/O as possible on this socket.  There are no
3316          * limits here, currently.
3317          */
3318         dev = ISC_LIST_HEAD(sock->recv_list);
3319         while (dev != NULL) {
3320                 switch (doio_recv(sock, dev)) {
3321                 case DOIO_SOFT:
3322                         goto poke;
3323
3324                 case DOIO_EOF:
3325                         /*
3326                          * read of 0 means the remote end was closed.
3327                          * Run through the event queue and dispatch all
3328                          * the events with an EOF result code.
3329                          */
3330                         do {
3331                                 dev->result = ISC_R_EOF;
3332                                 send_recvdone_event(sock, &dev);
3333                                 dev = ISC_LIST_HEAD(sock->recv_list);
3334                         } while (dev != NULL);
3335                         goto poke;
3336
3337                 case DOIO_SUCCESS:
3338                 case DOIO_HARD:
3339                         send_recvdone_event(sock, &dev);
3340                         break;
3341                 }
3342
3343                 dev = ISC_LIST_HEAD(sock->recv_list);
3344         }
3345
3346  poke:
3347         if (!ISC_LIST_EMPTY(sock->recv_list))
3348                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3349
3350         UNLOCK(&sock->lock);
3351 }
3352
3353 static void
3354 internal_send(isc_task_t *me, isc_event_t *ev) {
3355         isc_socketevent_t *dev;
3356         isc__socket_t *sock;
3357
3358         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3359
3360         /*
3361          * Find out what socket this is and lock it.
3362          */
3363         sock = (isc__socket_t *)ev->ev_sender;
3364         INSIST(VALID_SOCKET(sock));
3365
3366         LOCK(&sock->lock);
3367         socket_log(sock, NULL, IOEVENT,
3368                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3369                    "internal_send: task %p got event %p", me, ev);
3370
3371         INSIST(sock->pending_send == 1);
3372         sock->pending_send = 0;
3373
3374         INSIST(sock->references > 0);
3375         sock->references--;  /* the internal event is done with this socket */
3376         if (sock->references == 0) {
3377                 UNLOCK(&sock->lock);
3378                 destroy(&sock);
3379                 return;
3380         }
3381
3382         /*
3383          * Try to do as much I/O as possible on this socket.  There are no
3384          * limits here, currently.
3385          */
3386         dev = ISC_LIST_HEAD(sock->send_list);
3387         while (dev != NULL) {
3388                 switch (doio_send(sock, dev)) {
3389                 case DOIO_SOFT:
3390                         goto poke;
3391
3392                 case DOIO_HARD:
3393                 case DOIO_SUCCESS:
3394                         send_senddone_event(sock, &dev);
3395                         break;
3396                 }
3397
3398                 dev = ISC_LIST_HEAD(sock->send_list);
3399         }
3400
3401  poke:
3402         if (!ISC_LIST_EMPTY(sock->send_list))
3403                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3404
3405         UNLOCK(&sock->lock);
3406 }
3407
3408 static void
3409 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3410         isc__socket_t *sock;
3411         int more_data;
3412
3413         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3414
3415         /*
3416          * Find out what socket this is and lock it.
3417          */
3418         sock = (isc__socket_t *)ev->ev_sender;
3419         INSIST(VALID_SOCKET(sock));
3420
3421         LOCK(&sock->lock);
3422         socket_log(sock, NULL, IOEVENT,
3423                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3424                    "internal_fdwatch_write: task %p got event %p", me, ev);
3425
3426         INSIST(sock->pending_send == 1);
3427
3428         UNLOCK(&sock->lock);
3429         more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3430                                       sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3431         LOCK(&sock->lock);
3432
3433         sock->pending_send = 0;
3434
3435         INSIST(sock->references > 0);
3436         sock->references--;  /* the internal event is done with this socket */
3437         if (sock->references == 0) {
3438                 UNLOCK(&sock->lock);
3439                 destroy(&sock);
3440                 return;
3441         }
3442
3443         if (more_data)
3444                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3445
3446         UNLOCK(&sock->lock);
3447 }
3448
3449 static void
3450 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3451         isc__socket_t *sock;
3452         int more_data;
3453
3454         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3455
3456         /*
3457          * Find out what socket this is and lock it.
3458          */
3459         sock = (isc__socket_t *)ev->ev_sender;
3460         INSIST(VALID_SOCKET(sock));
3461
3462         LOCK(&sock->lock);
3463         socket_log(sock, NULL, IOEVENT,
3464                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3465                    "internal_fdwatch_read: task %p got event %p", me, ev);
3466
3467         INSIST(sock->pending_recv == 1);
3468
3469         UNLOCK(&sock->lock);
3470         more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3471                                       sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3472         LOCK(&sock->lock);
3473
3474         sock->pending_recv = 0;
3475
3476         INSIST(sock->references > 0);
3477         sock->references--;  /* the internal event is done with this socket */
3478         if (sock->references == 0) {
3479                 UNLOCK(&sock->lock);
3480                 destroy(&sock);
3481                 return;
3482         }
3483
3484         if (more_data)
3485                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3486
3487         UNLOCK(&sock->lock);
3488 }
3489
3490 /*
3491  * Process read/writes on each fd here.  Avoid locking
3492  * and unlocking twice if both reads and writes are possible.
3493  */
3494 static void
3495 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
3496            isc_boolean_t writeable)
3497 {
3498         isc__socket_t *sock;
3499         isc_boolean_t unlock_sock;
3500         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3501         int lockid = FDLOCK_ID(fd);
3502
3503         /*
3504          * If the socket is going to be closed, don't do more I/O.
3505          */
3506         LOCK(&manager->fdlock[lockid]);
3507         if (manager->fdstate[fd] == CLOSE_PENDING) {
3508                 UNLOCK(&manager->fdlock[lockid]);
3509
3510                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3511                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3512                 return;
3513         }
3514
3515         sock = manager->fds[fd];
3516         unlock_sock = ISC_FALSE;
3517         if (readable) {
3518                 if (sock == NULL) {
3519                         unwatch_read = ISC_TRUE;
3520                         goto check_write;
3521                 }
3522                 unlock_sock = ISC_TRUE;
3523                 LOCK(&sock->lock);
3524                 if (!SOCK_DEAD(sock)) {
3525                         if (sock->listener)
3526                                 dispatch_accept(sock);
3527                         else
3528                                 dispatch_recv(sock);
3529                 }
3530                 unwatch_read = ISC_TRUE;
3531         }
3532 check_write:
3533         if (writeable) {
3534                 if (sock == NULL) {
3535                         unwatch_write = ISC_TRUE;
3536                         goto unlock_fd;
3537                 }
3538                 if (!unlock_sock) {
3539                         unlock_sock = ISC_TRUE;
3540                         LOCK(&sock->lock);
3541                 }
3542                 if (!SOCK_DEAD(sock)) {
3543                         if (sock->connecting)
3544                                 dispatch_connect(sock);
3545                         else
3546                                 dispatch_send(sock);
3547                 }
3548                 unwatch_write = ISC_TRUE;
3549         }
3550         if (unlock_sock)
3551                 UNLOCK(&sock->lock);
3552
3553  unlock_fd:
3554         UNLOCK(&manager->fdlock[lockid]);
3555         if (unwatch_read)
3556                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3557         if (unwatch_write)
3558                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3559
3560 }
3561
3562 #ifdef USE_KQUEUE
3563 static isc_boolean_t
3564 process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
3565         int i;
3566         isc_boolean_t readable, writable;
3567         isc_boolean_t done = ISC_FALSE;
3568 #ifdef USE_WATCHER_THREAD
3569         isc_boolean_t have_ctlevent = ISC_FALSE;
3570 #endif
3571
3572         if (nevents == manager->nevents) {
3573                 /*
3574                  * This is not an error, but something unexpected.  If this
3575                  * happens, it may indicate the need for increasing
3576                  * ISC_SOCKET_MAXEVENTS.
3577                  */
3578                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3579                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3580                             "maximum number of FD events (%d) received",
3581                             nevents);
3582         }
3583
3584         for (i = 0; i < nevents; i++) {
3585                 REQUIRE(events[i].ident < manager->maxsocks);
3586 #ifdef USE_WATCHER_THREAD
3587                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3588                         have_ctlevent = ISC_TRUE;
3589                         continue;
3590                 }
3591 #endif
3592                 readable = ISC_TF(events[i].filter == EVFILT_READ);
3593                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3594                 process_fd(manager, events[i].ident, readable, writable);
3595         }
3596
3597 #ifdef USE_WATCHER_THREAD
3598         if (have_ctlevent)
3599                 done = process_ctlfd(manager);
3600 #endif
3601
3602         return (done);
3603 }
3604 #elif defined(USE_EPOLL)
3605 static isc_boolean_t
3606 process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
3607 {
3608         int i;
3609         isc_boolean_t done = ISC_FALSE;
3610 #ifdef USE_WATCHER_THREAD
3611         isc_boolean_t have_ctlevent = ISC_FALSE;
3612 #endif
3613
3614         if (nevents == manager->nevents) {
3615                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3616                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3617                             "maximum number of FD events (%d) received",
3618                             nevents);
3619         }
3620
3621         for (i = 0; i < nevents; i++) {
3622                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3623 #ifdef USE_WATCHER_THREAD
3624                 if (events[i].data.fd == manager->pipe_fds[0]) {
3625                         have_ctlevent = ISC_TRUE;
3626                         continue;
3627                 }
3628 #endif
3629                 if ((events[i].events & EPOLLERR) != 0 ||
3630                     (events[i].events & EPOLLHUP) != 0) {
3631                         /*
3632                          * epoll does not set IN/OUT bits on an erroneous
3633                          * condition, so we need to try both anyway.  This is a
3634                          * bit inefficient, but should be okay for such rare
3635                          * events.  Note also that the read or write attempt
3636                          * won't block because we use non-blocking sockets.
3637                          */
3638                         events[i].events |= (EPOLLIN | EPOLLOUT);
3639                 }
3640                 process_fd(manager, events[i].data.fd,
3641                            (events[i].events & EPOLLIN) != 0,
3642                            (events[i].events & EPOLLOUT) != 0);
3643         }
3644
3645 #ifdef USE_WATCHER_THREAD
3646         if (have_ctlevent)
3647                 done = process_ctlfd(manager);
3648 #endif
3649
3650         return (done);
3651 }
3652 #elif defined(USE_DEVPOLL)
3653 static isc_boolean_t
3654 process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
3655         int i;
3656         isc_boolean_t done = ISC_FALSE;
3657 #ifdef USE_WATCHER_THREAD
3658         isc_boolean_t have_ctlevent = ISC_FALSE;
3659 #endif
3660
3661         if (nevents == manager->nevents) {
3662                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3663                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3664                             "maximum number of FD events (%d) received",
3665                             nevents);
3666         }
3667
3668         for (i = 0; i < nevents; i++) {
3669                 REQUIRE(events[i].fd < (int)manager->maxsocks);
3670 #ifdef USE_WATCHER_THREAD
3671                 if (events[i].fd == manager->pipe_fds[0]) {
3672                         have_ctlevent = ISC_TRUE;
3673                         continue;
3674                 }
3675 #endif
3676                 process_fd(manager, events[i].fd,
3677                            (events[i].events & POLLIN) != 0,
3678                            (events[i].events & POLLOUT) != 0);
3679         }
3680
3681 #ifdef USE_WATCHER_THREAD
3682         if (have_ctlevent)
3683                 done = process_ctlfd(manager);
3684 #endif
3685
3686         return (done);
3687 }
3688 #elif defined(USE_SELECT)
3689 static void
3690 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
3691             fd_set *writefds)
3692 {
3693         int i;
3694
3695         REQUIRE(maxfd <= (int)manager->maxsocks);
3696
3697         for (i = 0; i < maxfd; i++) {
3698 #ifdef USE_WATCHER_THREAD
3699                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3700                         continue;
3701 #endif /* USE_WATCHER_THREAD */
3702                 process_fd(manager, i, FD_ISSET(i, readfds),
3703                            FD_ISSET(i, writefds));
3704         }
3705 }
3706 #endif
3707
3708 #ifdef USE_WATCHER_THREAD
3709 static isc_boolean_t
3710 process_ctlfd(isc__socketmgr_t *manager) {
3711         int msg, fd;
3712
3713         for (;;) {
3714                 select_readmsg(manager, &fd, &msg);
3715
3716                 manager_log(manager, IOEVENT,
3717                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3718                                            ISC_MSG_WATCHERMSG,
3719                                            "watcher got message %d "
3720                                            "for socket %d"), msg, fd);
3721
3722                 /*
3723                  * Nothing to read?
3724                  */
3725                 if (msg == SELECT_POKE_NOTHING)
3726                         break;
3727
3728                 /*
3729                  * Handle shutdown message.  We really should
3730                  * jump out of this loop right away, but
3731                  * it doesn't matter if we have to do a little
3732                  * more work first.
3733                  */
3734                 if (msg == SELECT_POKE_SHUTDOWN)
3735                         return (ISC_TRUE);
3736
3737                 /*
3738                  * This is a wakeup on a socket.  Look
3739                  * at the event queue for both read and write,
3740                  * and decide if we need to watch on it now
3741                  * or not.
3742                  */
3743                 wakeup_socket(manager, fd, msg);
3744         }
3745
3746         return (ISC_FALSE);
3747 }
3748
3749 /*
3750  * This is the thread that will loop forever, always in a select or poll
3751  * call.
3752  *
3753  * When select returns something to do, track down what thread gets to do
3754  * this I/O and post the event to it.
3755  */
3756 static isc_threadresult_t
3757 watcher(void *uap) {
3758         isc__socketmgr_t *manager = uap;
3759         isc_boolean_t done;
3760         int cc;
3761 #ifdef USE_KQUEUE
3762         const char *fnname = "kevent()";
3763 #elif defined (USE_EPOLL)
3764         const char *fnname = "epoll_wait()";
3765 #elif defined(USE_DEVPOLL)
3766         const char *fnname = "ioctl(DP_POLL)";
3767         struct dvpoll dvp;
3768 #elif defined (USE_SELECT)
3769         const char *fnname = "select()";
3770         int maxfd;
3771         int ctlfd;
3772 #endif
3773         char strbuf[ISC_STRERRORSIZE];
3774 #ifdef ISC_SOCKET_USE_POLLWATCH
3775         pollstate_t pollstate = poll_idle;
3776 #endif
3777
3778 #if defined (USE_SELECT)
3779         /*
3780          * Get the control fd here.  This will never change.
3781          */
3782         ctlfd = manager->pipe_fds[0];
3783 #endif
3784         done = ISC_FALSE;
3785         while (!done) {
3786                 do {
3787 #ifdef USE_KQUEUE
3788                         cc = kevent(manager->kqueue_fd, NULL, 0,
3789                                     manager->events, manager->nevents, NULL);
3790 #elif defined(USE_EPOLL)
3791                         cc = epoll_wait(manager->epoll_fd, manager->events,
3792                                         manager->nevents, -1);
3793 #elif defined(USE_DEVPOLL)
3794                         dvp.dp_fds = manager->events;
3795                         dvp.dp_nfds = manager->nevents;
3796 #ifndef ISC_SOCKET_USE_POLLWATCH
3797                         dvp.dp_timeout = -1;
3798 #else
3799                         if (pollstate == poll_idle)
3800                                 dvp.dp_timeout = -1;
3801                         else
3802                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3803 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3804                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3805 #elif defined(USE_SELECT)
3806                         LOCK(&manager->lock);
3807                         memmove(manager->read_fds_copy, manager->read_fds,
3808                                 manager->fd_bufsize);
3809                         memmove(manager->write_fds_copy, manager->write_fds,
3810                                 manager->fd_bufsize);
3811                         maxfd = manager->maxfd + 1;
3812                         UNLOCK(&manager->lock);
3813
3814                         cc = select(maxfd, manager->read_fds_copy,
3815                                     manager->write_fds_copy, NULL, NULL);
3816 #endif  /* USE_KQUEUE */
3817
3818                         if (cc < 0 && !SOFT_ERROR(errno)) {
3819                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3820                                 FATAL_ERROR(__FILE__, __LINE__,
3821                                             "%s %s: %s", fnname,
3822                                             isc_msgcat_get(isc_msgcat,
3823                                                            ISC_MSGSET_GENERAL,
3824                                                            ISC_MSG_FAILED,
3825                                                            "failed"), strbuf);
3826                         }
3827
3828 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3829                         if (cc == 0) {
3830                                 if (pollstate == poll_active)
3831                                         pollstate = poll_checking;
3832                                 else if (pollstate == poll_checking)
3833                                         pollstate = poll_idle;
3834                         } else if (cc > 0) {
3835                                 if (pollstate == poll_checking) {
3836                                         /*
3837                                          * XXX: We'd like to use a more
3838                                          * verbose log level as it's actually an
3839                                          * unexpected event, but the kernel bug
3840                                          * reportedly happens pretty frequently
3841                                          * (and it can also be a false positive)
3842                                          * so it would be just too noisy.
3843                                          */
3844                                         manager_log(manager,
3845                                                     ISC_LOGCATEGORY_GENERAL,
3846                                                     ISC_LOGMODULE_SOCKET,
3847                                                     ISC_LOG_DEBUG(1),
3848                                                     "unexpected POLL timeout");
3849                                 }
3850                                 pollstate = poll_active;
3851                         }
3852 #endif
3853                 } while (cc < 0);
3854
3855 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3856                 done = process_fds(manager, manager->events, cc);
3857 #elif defined(USE_SELECT)
3858                 process_fds(manager, maxfd, manager->read_fds_copy,
3859                             manager->write_fds_copy);
3860
3861                 /*
3862                  * Process reads on internal, control fd.
3863                  */
3864                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3865                         done = process_ctlfd(manager);
3866 #endif
3867         }
3868
3869         manager_log(manager, TRACE, "%s",
3870                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3871                                    ISC_MSG_EXITING, "watcher exiting"));
3872
3873         return ((isc_threadresult_t)0);
3874 }
3875 #endif /* USE_WATCHER_THREAD */
3876
3877 #ifdef BIND9
3878 ISC_SOCKETFUNC_SCOPE void
3879 isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
3880         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3881
3882         REQUIRE(VALID_MANAGER(manager));
3883
3884         manager->reserved = reserved;
3885 }
3886
3887 ISC_SOCKETFUNC_SCOPE void
3888 isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
3889         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3890
3891         REQUIRE(VALID_MANAGER(manager));
3892
3893         manager->maxudp = maxudp;
3894 }
3895 #endif  /* BIND9 */
3896
3897 /*
3898  * Create a new socket manager.
3899  */
3900
3901 static isc_result_t
3902 setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
3903         isc_result_t result;
3904 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3905         char strbuf[ISC_STRERRORSIZE];
3906 #endif
3907
3908 #ifdef USE_KQUEUE
3909         manager->nevents = ISC_SOCKET_MAXEVENTS;
3910         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3911                                       manager->nevents);
3912         if (manager->events == NULL)
3913                 return (ISC_R_NOMEMORY);
3914         manager->kqueue_fd = kqueue();
3915         if (manager->kqueue_fd == -1) {
3916                 result = isc__errno2result(errno);
3917                 isc__strerror(errno, strbuf, sizeof(strbuf));
3918                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3919                                  "kqueue %s: %s",
3920                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3921                                                 ISC_MSG_FAILED, "failed"),
3922                                  strbuf);
3923                 isc_mem_put(mctx, manager->events,
3924                             sizeof(struct kevent) * manager->nevents);
3925                 return (result);
3926         }
3927
3928 #ifdef USE_WATCHER_THREAD
3929         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3930         if (result != ISC_R_SUCCESS) {
3931                 close(manager->kqueue_fd);
3932                 isc_mem_put(mctx, manager->events,
3933                             sizeof(struct kevent) * manager->nevents);
3934                 return (result);
3935         }
3936 #endif  /* USE_WATCHER_THREAD */
3937 #elif defined(USE_EPOLL)
3938         manager->nevents = ISC_SOCKET_MAXEVENTS;
3939         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3940                                       manager->nevents);
3941         if (manager->events == NULL)
3942                 return (ISC_R_NOMEMORY);
3943         manager->epoll_fd = epoll_create(manager->nevents);
3944         if (manager->epoll_fd == -1) {
3945                 result = isc__errno2result(errno);
3946                 isc__strerror(errno, strbuf, sizeof(strbuf));
3947                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3948                                  "epoll_create %s: %s",
3949                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3950                                                 ISC_MSG_FAILED, "failed"),
3951                                  strbuf);
3952                 isc_mem_put(mctx, manager->events,
3953                             sizeof(struct epoll_event) * manager->nevents);
3954                 return (result);
3955         }
3956 #ifdef USE_WATCHER_THREAD
3957         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3958         if (result != ISC_R_SUCCESS) {
3959                 close(manager->epoll_fd);
3960                 isc_mem_put(mctx, manager->events,
3961                             sizeof(struct epoll_event) * manager->nevents);
3962                 return (result);
3963         }
3964 #endif  /* USE_WATCHER_THREAD */
3965 #elif defined(USE_DEVPOLL)
3966         /*
3967          * XXXJT: /dev/poll seems to reject large numbers of events,
3968          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3969          */
3970         manager->nevents = ISC_SOCKET_MAXEVENTS;
3971         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3972                                       manager->nevents);
3973         if (manager->events == NULL)
3974                 return (ISC_R_NOMEMORY);
3975         /*
3976          * Note: fdpollinfo should be able to support all possible FDs, so
3977          * it must have maxsocks entries (not nevents).
3978          */
3979         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3980                                           manager->maxsocks);
3981         if (manager->fdpollinfo == NULL) {
3982                 isc_mem_put(mctx, manager->events,
3983                             sizeof(struct pollfd) * manager->nevents);
3984                 return (ISC_R_NOMEMORY);
3985         }
3986         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3987         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3988         if (manager->devpoll_fd == -1) {
3989                 result = isc__errno2result(errno);
3990                 isc__strerror(errno, strbuf, sizeof(strbuf));
3991                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3992                                  "open(/dev/poll) %s: %s",
3993                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3994                                                 ISC_MSG_FAILED, "failed"),
3995                                  strbuf);
3996                 isc_mem_put(mctx, manager->events,
3997                             sizeof(struct pollfd) * manager->nevents);
3998                 isc_mem_put(mctx, manager->fdpollinfo,
3999                             sizeof(pollinfo_t) * manager->maxsocks);
4000                 return (result);
4001         }
4002 #ifdef USE_WATCHER_THREAD
4003         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4004         if (result != ISC_R_SUCCESS) {
4005                 close(manager->devpoll_fd);
4006                 isc_mem_put(mctx, manager->events,
4007                             sizeof(struct pollfd) * manager->nevents);
4008                 isc_mem_put(mctx, manager->fdpollinfo,
4009                             sizeof(pollinfo_t) * manager->maxsocks);
4010                 return (result);
4011         }
4012 #endif  /* USE_WATCHER_THREAD */
4013 #elif defined(USE_SELECT)
4014         UNUSED(result);
4015
4016 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
4017         /*
4018          * Note: this code should also cover the case of MAXSOCKETS <=
4019          * FD_SETSIZE, but we separate the cases to avoid possible portability
4020          * issues regarding howmany() and the actual representation of fd_set.
4021          */
4022         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
4023                 sizeof(fd_mask);
4024 #else
4025         manager->fd_bufsize = sizeof(fd_set);
4026 #endif
4027
4028         manager->read_fds = NULL;
4029         manager->read_fds_copy = NULL;
4030         manager->write_fds = NULL;
4031         manager->write_fds_copy = NULL;
4032
4033         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
4034         if (manager->read_fds != NULL)
4035                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
4036         if (manager->read_fds_copy != NULL)
4037                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
4038         if (manager->write_fds != NULL) {
4039                 manager->write_fds_copy = isc_mem_get(mctx,
4040                                                       manager->fd_bufsize);
4041         }
4042         if (manager->write_fds_copy == NULL) {
4043                 if (manager->write_fds != NULL) {
4044                         isc_mem_put(mctx, manager->write_fds,
4045                                     manager->fd_bufsize);
4046                 }
4047                 if (manager->read_fds_copy != NULL) {
4048                         isc_mem_put(mctx, manager->read_fds_copy,
4049                                     manager->fd_bufsize);
4050                 }
4051                 if (manager->read_fds != NULL) {
4052                         isc_mem_put(mctx, manager->read_fds,
4053                                     manager->fd_bufsize);
4054                 }
4055                 return (ISC_R_NOMEMORY);
4056         }
4057         memset(manager->read_fds, 0, manager->fd_bufsize);
4058         memset(manager->write_fds, 0, manager->fd_bufsize);
4059
4060 #ifdef USE_WATCHER_THREAD
4061         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4062         manager->maxfd = manager->pipe_fds[0];
4063 #else /* USE_WATCHER_THREAD */
4064         manager->maxfd = 0;
4065 #endif /* USE_WATCHER_THREAD */
4066 #endif  /* USE_KQUEUE */
4067
4068         return (ISC_R_SUCCESS);
4069 }
4070
4071 static void
4072 cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
4073 #ifdef USE_WATCHER_THREAD
4074         isc_result_t result;
4075
4076         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4077         if (result != ISC_R_SUCCESS) {
4078                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4079                                  "epoll_ctl(DEL) %s",
4080                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4081                                                 ISC_MSG_FAILED, "failed"));
4082         }
4083 #endif  /* USE_WATCHER_THREAD */
4084
4085 #ifdef USE_KQUEUE
4086         close(manager->kqueue_fd);
4087         isc_mem_put(mctx, manager->events,
4088                     sizeof(struct kevent) * manager->nevents);
4089 #elif defined(USE_EPOLL)
4090         close(manager->epoll_fd);
4091         isc_mem_put(mctx, manager->events,
4092                     sizeof(struct epoll_event) * manager->nevents);
4093 #elif defined(USE_DEVPOLL)
4094         close(manager->devpoll_fd);
4095         isc_mem_put(mctx, manager->events,
4096                     sizeof(struct pollfd) * manager->nevents);
4097         isc_mem_put(mctx, manager->fdpollinfo,
4098                     sizeof(pollinfo_t) * manager->maxsocks);
4099 #elif defined(USE_SELECT)
4100         if (manager->read_fds != NULL)
4101                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
4102         if (manager->read_fds_copy != NULL)
4103                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
4104         if (manager->write_fds != NULL)
4105                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
4106         if (manager->write_fds_copy != NULL)
4107                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
4108 #endif  /* USE_KQUEUE */
4109 }
4110
4111 ISC_SOCKETFUNC_SCOPE isc_result_t
4112 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
4113         return (isc__socketmgr_create2(mctx, managerp, 0));
4114 }
4115
4116 ISC_SOCKETFUNC_SCOPE isc_result_t
4117 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
4118                        unsigned int maxsocks)
4119 {
4120         int i;
4121         isc__socketmgr_t *manager;
4122 #ifdef USE_WATCHER_THREAD
4123         char strbuf[ISC_STRERRORSIZE];
4124 #endif
4125         isc_result_t result;
4126
4127         REQUIRE(managerp != NULL && *managerp == NULL);
4128
4129 #ifdef USE_SHARED_MANAGER
4130         if (socketmgr != NULL) {
4131                 /* Don't allow maxsocks to be updated */
4132                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
4133                         return (ISC_R_EXISTS);
4134
4135                 socketmgr->refs++;
4136                 *managerp = (isc_socketmgr_t *)socketmgr;
4137                 return (ISC_R_SUCCESS);
4138         }
4139 #endif /* USE_SHARED_MANAGER */
4140
4141         if (maxsocks == 0)
4142                 maxsocks = ISC_SOCKET_MAXSOCKETS;
4143
4144         manager = isc_mem_get(mctx, sizeof(*manager));
4145         if (manager == NULL)
4146                 return (ISC_R_NOMEMORY);
4147
4148         /* zero-clear so that necessary cleanup on failure will be easy */
4149         memset(manager, 0, sizeof(*manager));
4150         manager->maxsocks = maxsocks;
4151         manager->reserved = 0;
4152         manager->maxudp = 0;
4153         manager->fds = isc_mem_get(mctx,
4154                                    manager->maxsocks * sizeof(isc__socket_t *));
4155         if (manager->fds == NULL) {
4156                 result = ISC_R_NOMEMORY;
4157                 goto free_manager;
4158         }
4159         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
4160         if (manager->fdstate == NULL) {
4161                 result = ISC_R_NOMEMORY;
4162                 goto free_manager;
4163         }
4164         manager->stats = NULL;
4165
4166         manager->common.methods = &socketmgrmethods;
4167         manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
4168         manager->common.impmagic = SOCKET_MANAGER_MAGIC;
4169         manager->mctx = NULL;
4170         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
4171         ISC_LIST_INIT(manager->socklist);
4172         result = isc_mutex_init(&manager->lock);
4173         if (result != ISC_R_SUCCESS)
4174                 goto free_manager;
4175         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
4176         if (manager->fdlock == NULL) {
4177                 result = ISC_R_NOMEMORY;
4178                 goto cleanup_lock;
4179         }
4180         for (i = 0; i < FDLOCK_COUNT; i++) {
4181                 result = isc_mutex_init(&manager->fdlock[i]);
4182                 if (result != ISC_R_SUCCESS) {
4183                         while (--i >= 0)
4184                                 DESTROYLOCK(&manager->fdlock[i]);
4185                         isc_mem_put(mctx, manager->fdlock,
4186                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
4187                         manager->fdlock = NULL;
4188                         goto cleanup_lock;
4189                 }
4190         }
4191
4192 #ifdef USE_WATCHER_THREAD
4193         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
4194                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4195                                  "isc_condition_init() %s",
4196                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4197                                                 ISC_MSG_FAILED, "failed"));
4198                 result = ISC_R_UNEXPECTED;
4199                 goto cleanup_lock;
4200         }
4201
4202         /*
4203          * Create the special fds that will be used to wake up the
4204          * select/poll loop when something internal needs to be done.
4205          */
4206         if (pipe(manager->pipe_fds) != 0) {
4207                 isc__strerror(errno, strbuf, sizeof(strbuf));
4208                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4209                                  "pipe() %s: %s",
4210                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4211                                                 ISC_MSG_FAILED, "failed"),
4212                                  strbuf);
4213                 result = ISC_R_UNEXPECTED;
4214                 goto cleanup_condition;
4215         }
4216
4217         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
4218 #if 0
4219         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
4220 #endif
4221 #endif  /* USE_WATCHER_THREAD */
4222
4223 #ifdef USE_SHARED_MANAGER
4224         manager->refs = 1;
4225 #endif /* USE_SHARED_MANAGER */
4226
4227         /*
4228          * Set up initial state for the select loop
4229          */
4230         result = setup_watcher(mctx, manager);
4231         if (result != ISC_R_SUCCESS)
4232                 goto cleanup;
4233         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
4234 #ifdef USE_WATCHER_THREAD
4235         /*
4236          * Start up the select/poll thread.
4237          */
4238         if (isc_thread_create(watcher, manager, &manager->watcher) !=
4239             ISC_R_SUCCESS) {
4240                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4241                                  "isc_thread_create() %s",
4242                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4243                                                 ISC_MSG_FAILED, "failed"));
4244                 cleanup_watcher(mctx, manager);
4245                 result = ISC_R_UNEXPECTED;
4246                 goto cleanup;
4247         }
4248 #endif /* USE_WATCHER_THREAD */
4249         isc_mem_attach(mctx, &manager->mctx);
4250
4251 #ifdef USE_SHARED_MANAGER
4252         socketmgr = manager;
4253 #endif /* USE_SHARED_MANAGER */
4254         *managerp = (isc_socketmgr_t *)manager;
4255
4256         return (ISC_R_SUCCESS);
4257
4258 cleanup:
4259 #ifdef USE_WATCHER_THREAD
4260         (void)close(manager->pipe_fds[0]);
4261         (void)close(manager->pipe_fds[1]);
4262 #endif  /* USE_WATCHER_THREAD */
4263
4264 #ifdef USE_WATCHER_THREAD
4265 cleanup_condition:
4266         (void)isc_condition_destroy(&manager->shutdown_ok);
4267 #endif  /* USE_WATCHER_THREAD */
4268
4269
4270 cleanup_lock:
4271         if (manager->fdlock != NULL) {
4272                 for (i = 0; i < FDLOCK_COUNT; i++)
4273                         DESTROYLOCK(&manager->fdlock[i]);
4274         }
4275         DESTROYLOCK(&manager->lock);
4276
4277 free_manager:
4278         if (manager->fdlock != NULL) {
4279                 isc_mem_put(mctx, manager->fdlock,
4280                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4281         }
4282         if (manager->fdstate != NULL) {
4283                 isc_mem_put(mctx, manager->fdstate,
4284                             manager->maxsocks * sizeof(int));
4285         }
4286         if (manager->fds != NULL) {
4287                 isc_mem_put(mctx, manager->fds,
4288                             manager->maxsocks * sizeof(isc_socket_t *));
4289         }
4290         isc_mem_put(mctx, manager, sizeof(*manager));
4291
4292         return (result);
4293 }
4294
4295 #ifdef BIND9
4296 isc_result_t
4297 isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
4298         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4299         REQUIRE(VALID_MANAGER(manager));
4300         REQUIRE(nsockp != NULL);
4301
4302         *nsockp = manager->maxsocks;
4303
4304         return (ISC_R_SUCCESS);
4305 }
4306
4307 void
4308 isc__socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
4309         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4310
4311         REQUIRE(VALID_MANAGER(manager));
4312         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4313         REQUIRE(manager->stats == NULL);
4314         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4315
4316         isc_stats_attach(stats, &manager->stats);
4317 }
4318 #endif
4319
4320 ISC_SOCKETFUNC_SCOPE void
4321 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
4322         isc__socketmgr_t *manager;
4323         int i;
4324         isc_mem_t *mctx;
4325
4326         /*
4327          * Destroy a socket manager.
4328          */
4329
4330         REQUIRE(managerp != NULL);
4331         manager = (isc__socketmgr_t *)*managerp;
4332         REQUIRE(VALID_MANAGER(manager));
4333
4334 #ifdef USE_SHARED_MANAGER
4335         manager->refs--;
4336         if (manager->refs > 0) {
4337                 *managerp = NULL;
4338                 return;
4339         }
4340         socketmgr = NULL;
4341 #endif /* USE_SHARED_MANAGER */
4342
4343         LOCK(&manager->lock);
4344
4345         /*
4346          * Wait for all sockets to be destroyed.
4347          */
4348         while (!ISC_LIST_EMPTY(manager->socklist)) {
4349 #ifdef USE_WATCHER_THREAD
4350                 manager_log(manager, CREATION, "%s",
4351                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4352                                            ISC_MSG_SOCKETSREMAIN,
4353                                            "sockets exist"));
4354                 WAIT(&manager->shutdown_ok, &manager->lock);
4355 #else /* USE_WATCHER_THREAD */
4356                 UNLOCK(&manager->lock);
4357                 isc__taskmgr_dispatch(NULL);
4358                 LOCK(&manager->lock);
4359 #endif /* USE_WATCHER_THREAD */
4360         }
4361
4362         UNLOCK(&manager->lock);
4363
4364         /*
4365          * Here, poke our select/poll thread.  Do this by closing the write
4366          * half of the pipe, which will send EOF to the read half.
4367          * This is currently a no-op in the non-threaded case.
4368          */
4369         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4370
4371 #ifdef USE_WATCHER_THREAD
4372         /*
4373          * Wait for thread to exit.
4374          */
4375         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4376                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4377                                  "isc_thread_join() %s",
4378                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4379                                                 ISC_MSG_FAILED, "failed"));
4380 #endif /* USE_WATCHER_THREAD */
4381
4382         /*
4383          * Clean up.
4384          */
4385         cleanup_watcher(manager->mctx, manager);
4386
4387 #ifdef USE_WATCHER_THREAD
4388         (void)close(manager->pipe_fds[0]);
4389         (void)close(manager->pipe_fds[1]);
4390         (void)isc_condition_destroy(&manager->shutdown_ok);
4391 #endif /* USE_WATCHER_THREAD */
4392
4393         for (i = 0; i < (int)manager->maxsocks; i++)
4394                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4395                         (void)close(i);
4396
4397         isc_mem_put(manager->mctx, manager->fds,
4398                     manager->maxsocks * sizeof(isc__socket_t *));
4399         isc_mem_put(manager->mctx, manager->fdstate,
4400                     manager->maxsocks * sizeof(int));
4401
4402         if (manager->stats != NULL)
4403                 isc_stats_detach(&manager->stats);
4404
4405         if (manager->fdlock != NULL) {
4406                 for (i = 0; i < FDLOCK_COUNT; i++)
4407                         DESTROYLOCK(&manager->fdlock[i]);
4408                 isc_mem_put(manager->mctx, manager->fdlock,
4409                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4410         }
4411         DESTROYLOCK(&manager->lock);
4412         manager->common.magic = 0;
4413         manager->common.impmagic = 0;
4414         mctx= manager->mctx;
4415         isc_mem_put(mctx, manager, sizeof(*manager));
4416
4417         isc_mem_detach(&mctx);
4418
4419         *managerp = NULL;
4420
4421 #ifdef USE_SHARED_MANAGER
4422         socketmgr = NULL;
4423 #endif
4424 }
4425
4426 static isc_result_t
4427 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4428             unsigned int flags)
4429 {
4430         int io_state;
4431         isc_boolean_t have_lock = ISC_FALSE;
4432         isc_task_t *ntask = NULL;
4433         isc_result_t result = ISC_R_SUCCESS;
4434
4435         dev->ev_sender = task;
4436
4437         if (sock->type == isc_sockettype_udp) {
4438                 io_state = doio_recv(sock, dev);
4439         } else {
4440                 LOCK(&sock->lock);
4441                 have_lock = ISC_TRUE;
4442
4443                 if (ISC_LIST_EMPTY(sock->recv_list))
4444                         io_state = doio_recv(sock, dev);
4445                 else
4446                         io_state = DOIO_SOFT;
4447         }
4448
4449         switch (io_state) {
4450         case DOIO_SOFT:
4451                 /*
4452                  * We couldn't read all or part of the request right now, so
4453                  * queue it.
4454                  *
4455                  * Attach to socket and to task
4456                  */
4457                 isc_task_attach(task, &ntask);
4458                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4459
4460                 if (!have_lock) {
4461                         LOCK(&sock->lock);
4462                         have_lock = ISC_TRUE;
4463                 }
4464
4465                 /*
4466                  * Enqueue the request.  If the socket was previously not being
4467                  * watched, poke the watcher to start paying attention to it.
4468                  */
4469                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4470                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4471                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4472
4473                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4474                            "socket_recv: event %p -> task %p",
4475                            dev, ntask);
4476
4477                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4478                         result = ISC_R_INPROGRESS;
4479                 break;
4480
4481         case DOIO_EOF:
4482                 dev->result = ISC_R_EOF;
4483                 /* fallthrough */
4484
4485         case DOIO_HARD:
4486         case DOIO_SUCCESS:
4487                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4488                         send_recvdone_event(sock, &dev);
4489                 break;
4490         }
4491
4492         if (have_lock)
4493                 UNLOCK(&sock->lock);
4494
4495         return (result);
4496 }
4497
4498 ISC_SOCKETFUNC_SCOPE isc_result_t
4499 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4500                   unsigned int minimum, isc_task_t *task,
4501                   isc_taskaction_t action, const void *arg)
4502 {
4503         isc__socket_t *sock = (isc__socket_t *)sock0;
4504         isc_socketevent_t *dev;
4505         isc__socketmgr_t *manager;
4506         unsigned int iocount;
4507         isc_buffer_t *buffer;
4508
4509         REQUIRE(VALID_SOCKET(sock));
4510         REQUIRE(buflist != NULL);
4511         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4512         REQUIRE(task != NULL);
4513         REQUIRE(action != NULL);
4514
4515         manager = sock->manager;
4516         REQUIRE(VALID_MANAGER(manager));
4517
4518         iocount = isc_bufferlist_availablecount(buflist);
4519         REQUIRE(iocount > 0);
4520
4521         INSIST(sock->bound);
4522
4523         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4524         if (dev == NULL)
4525                 return (ISC_R_NOMEMORY);
4526
4527         /*
4528          * UDP sockets are always partial read
4529          */
4530         if (sock->type == isc_sockettype_udp)
4531                 dev->minimum = 1;
4532         else {
4533                 if (minimum == 0)
4534                         dev->minimum = iocount;
4535                 else
4536                         dev->minimum = minimum;
4537         }
4538
4539         /*
4540          * Move each buffer from the passed in list to our internal one.
4541          */
4542         buffer = ISC_LIST_HEAD(*buflist);
4543         while (buffer != NULL) {
4544                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4545                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4546                 buffer = ISC_LIST_HEAD(*buflist);
4547         }
4548
4549         return (socket_recv(sock, dev, task, 0));
4550 }
4551
4552 ISC_SOCKETFUNC_SCOPE isc_result_t
4553 isc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
4554                  unsigned int minimum, isc_task_t *task,
4555                  isc_taskaction_t action, const void *arg)
4556 {
4557         isc__socket_t *sock = (isc__socket_t *)sock0;
4558         isc_socketevent_t *dev;
4559         isc__socketmgr_t *manager;
4560
4561         REQUIRE(VALID_SOCKET(sock));
4562         REQUIRE(action != NULL);
4563
4564         manager = sock->manager;
4565         REQUIRE(VALID_MANAGER(manager));
4566
4567         INSIST(sock->bound);
4568
4569         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4570         if (dev == NULL)
4571                 return (ISC_R_NOMEMORY);
4572
4573         return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
4574 }
4575
4576 ISC_SOCKETFUNC_SCOPE isc_result_t
4577 isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4578                   unsigned int minimum, isc_task_t *task,
4579                   isc_socketevent_t *event, unsigned int flags)
4580 {
4581         isc__socket_t *sock = (isc__socket_t *)sock0;
4582
4583         event->ev_sender = sock;
4584         event->result = ISC_R_UNSET;
4585         ISC_LIST_INIT(event->bufferlist);
4586         event->region = *region;
4587         event->n = 0;
4588         event->offset = 0;
4589         event->attributes = 0;
4590
4591         /*
4592          * UDP sockets are always partial read.
4593          */
4594         if (sock->type == isc_sockettype_udp)
4595                 event->minimum = 1;
4596         else {
4597                 if (minimum == 0)
4598                         event->minimum = region->length;
4599                 else
4600                         event->minimum = minimum;
4601         }
4602
4603         return (socket_recv(sock, event, task, flags));
4604 }
4605
4606 static isc_result_t
4607 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4608             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4609             unsigned int flags)
4610 {
4611         int io_state;
4612         isc_boolean_t have_lock = ISC_FALSE;
4613         isc_task_t *ntask = NULL;
4614         isc_result_t result = ISC_R_SUCCESS;
4615
4616         dev->ev_sender = task;
4617
4618         set_dev_address(address, sock, dev);
4619         if (pktinfo != NULL) {
4620                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4621                 dev->pktinfo = *pktinfo;
4622
4623                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4624                     !isc_sockaddr_islinklocal(&dev->address)) {
4625                         socket_log(sock, NULL, TRACE, isc_msgcat,
4626                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4627                                    "pktinfo structure provided, ifindex %u "
4628                                    "(set to 0)", pktinfo->ipi6_ifindex);
4629
4630                         /*
4631                          * Set the pktinfo index to 0 here, to let the
4632                          * kernel decide what interface it should send on.
4633                          */
4634                         dev->pktinfo.ipi6_ifindex = 0;
4635                 }
4636         }
4637
4638         if (sock->type == isc_sockettype_udp)
4639                 io_state = doio_send(sock, dev);
4640         else {
4641                 LOCK(&sock->lock);
4642                 have_lock = ISC_TRUE;
4643
4644                 if (ISC_LIST_EMPTY(sock->send_list))
4645                         io_state = doio_send(sock, dev);
4646                 else
4647                         io_state = DOIO_SOFT;
4648         }
4649
4650         switch (io_state) {
4651         case DOIO_SOFT:
4652                 /*
4653                  * We couldn't send all or part of the request right now, so
4654                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4655                  */
4656                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4657                         isc_task_attach(task, &ntask);
4658                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4659
4660                         if (!have_lock) {
4661                                 LOCK(&sock->lock);
4662                                 have_lock = ISC_TRUE;
4663                         }
4664
4665                         /*
4666                          * Enqueue the request.  If the socket was previously
4667                          * not being watched, poke the watcher to start
4668                          * paying attention to it.
4669                          */
4670                         if (ISC_LIST_EMPTY(sock->send_list) &&
4671                             !sock->pending_send)
4672                                 select_poke(sock->manager, sock->fd,
4673                                             SELECT_POKE_WRITE);
4674                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4675
4676                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4677                                    "socket_send: event %p -> task %p",
4678                                    dev, ntask);
4679
4680                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4681                                 result = ISC_R_INPROGRESS;
4682                         break;
4683                 }
4684
4685         case DOIO_HARD:
4686         case DOIO_SUCCESS:
4687                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4688                         send_senddone_event(sock, &dev);
4689                 break;
4690         }
4691
4692         if (have_lock)
4693                 UNLOCK(&sock->lock);
4694
4695         return (result);
4696 }
4697
4698 ISC_SOCKETFUNC_SCOPE isc_result_t
4699 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
4700                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4701 {
4702         /*
4703          * REQUIRE() checking is performed in isc_socket_sendto().
4704          */
4705         return (isc__socket_sendto(sock, region, task, action, arg, NULL,
4706                                    NULL));
4707 }
4708
4709 ISC_SOCKETFUNC_SCOPE isc_result_t
4710 isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
4711                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4712                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4713 {
4714         isc__socket_t *sock = (isc__socket_t *)sock0;
4715         isc_socketevent_t *dev;
4716         isc__socketmgr_t *manager;
4717
4718         REQUIRE(VALID_SOCKET(sock));
4719         REQUIRE(region != NULL);
4720         REQUIRE(task != NULL);
4721         REQUIRE(action != NULL);
4722
4723         manager = sock->manager;
4724         REQUIRE(VALID_MANAGER(manager));
4725
4726         INSIST(sock->bound);
4727
4728         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4729         if (dev == NULL)
4730                 return (ISC_R_NOMEMORY);
4731
4732         dev->region = *region;
4733
4734         return (socket_send(sock, dev, task, address, pktinfo, 0));
4735 }
4736
4737 ISC_SOCKETFUNC_SCOPE isc_result_t
4738 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4739                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4740 {
4741         return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
4742                                      NULL, 0));
4743 }
4744
4745 ISC_SOCKETFUNC_SCOPE isc_result_t
4746 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4747                     isc_task_t *task, isc_taskaction_t action, const void *arg,
4748                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4749 {
4750         return (isc__socket_sendtov2(sock, buflist, task, action, arg, address,
4751                                      pktinfo, 0));
4752 }
4753
4754 ISC_SOCKETFUNC_SCOPE isc_result_t
4755 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4756                      isc_task_t *task, isc_taskaction_t action, const void *arg,
4757                      isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4758                      unsigned int flags)
4759 {
4760         isc__socket_t *sock = (isc__socket_t *)sock0;
4761         isc_socketevent_t *dev;
4762         isc__socketmgr_t *manager;
4763         unsigned int iocount;
4764         isc_buffer_t *buffer;
4765
4766         REQUIRE(VALID_SOCKET(sock));
4767         REQUIRE(buflist != NULL);
4768         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4769         REQUIRE(task != NULL);
4770         REQUIRE(action != NULL);
4771
4772         manager = sock->manager;
4773         REQUIRE(VALID_MANAGER(manager));
4774
4775         iocount = isc_bufferlist_usedcount(buflist);
4776         REQUIRE(iocount > 0);
4777
4778         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4779         if (dev == NULL)
4780                 return (ISC_R_NOMEMORY);
4781
4782         /*
4783          * Move each buffer from the passed in list to our internal one.
4784          */
4785         buffer = ISC_LIST_HEAD(*buflist);
4786         while (buffer != NULL) {
4787                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4788                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4789                 buffer = ISC_LIST_HEAD(*buflist);
4790         }
4791
4792         return (socket_send(sock, dev, task, address, pktinfo, flags));
4793 }
4794
4795 ISC_SOCKETFUNC_SCOPE isc_result_t
4796 isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region,
4797                     isc_task_t *task,
4798                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4799                     isc_socketevent_t *event, unsigned int flags)
4800 {
4801         isc__socket_t *sock = (isc__socket_t *)sock0;
4802
4803         REQUIRE(VALID_SOCKET(sock));
4804         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4805         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4806                 REQUIRE(sock->type == isc_sockettype_udp);
4807         event->ev_sender = sock;
4808         event->result = ISC_R_UNSET;
4809         ISC_LIST_INIT(event->bufferlist);
4810         event->region = *region;
4811         event->n = 0;
4812         event->offset = 0;
4813         event->attributes = 0;
4814
4815         return (socket_send(sock, event, task, address, pktinfo, flags));
4816 }
4817
4818 ISC_SOCKETFUNC_SCOPE void
4819 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4820 #ifdef ISC_PLATFORM_HAVESYSUNH
4821         int s;
4822         struct stat sb;
4823         char strbuf[ISC_STRERRORSIZE];
4824
4825         if (sockaddr->type.sa.sa_family != AF_UNIX)
4826                 return;
4827
4828 #ifndef S_ISSOCK
4829 #if defined(S_IFMT) && defined(S_IFSOCK)
4830 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4831 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4832 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4833 #endif
4834 #endif
4835
4836 #ifndef S_ISFIFO
4837 #if defined(S_IFMT) && defined(S_IFIFO)
4838 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4839 #elif defined(_S_IFMT) && defined(S_IFIFO)
4840 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4841 #endif
4842 #endif
4843
4844 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4845 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4846 #endif
4847
4848 #ifndef S_ISFIFO
4849 #define S_ISFIFO(mode) 0
4850 #endif
4851
4852 #ifndef S_ISSOCK
4853 #define S_ISSOCK(mode) 0
4854 #endif
4855
4856         if (active) {
4857                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4858                         isc__strerror(errno, strbuf, sizeof(strbuf));
4859                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4860                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4861                                       "isc_socket_cleanunix: stat(%s): %s",
4862                                       sockaddr->type.sunix.sun_path, strbuf);
4863                         return;
4864                 }
4865                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4866                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4867                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4868                                       "isc_socket_cleanunix: %s: not a socket",
4869                                       sockaddr->type.sunix.sun_path);
4870                         return;
4871                 }
4872                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4873                         isc__strerror(errno, strbuf, sizeof(strbuf));
4874                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4875                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4876                                       "isc_socket_cleanunix: unlink(%s): %s",
4877                                       sockaddr->type.sunix.sun_path, strbuf);
4878                 }
4879                 return;
4880         }
4881
4882         s = socket(AF_UNIX, SOCK_STREAM, 0);
4883         if (s < 0) {
4884                 isc__strerror(errno, strbuf, sizeof(strbuf));
4885                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4886                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4887                               "isc_socket_cleanunix: socket(%s): %s",
4888                               sockaddr->type.sunix.sun_path, strbuf);
4889                 return;
4890         }
4891
4892         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4893                 switch (errno) {
4894                 case ENOENT:    /* We exited cleanly last time */
4895                         break;
4896                 default:
4897                         isc__strerror(errno, strbuf, sizeof(strbuf));
4898                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4899                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4900                                       "isc_socket_cleanunix: stat(%s): %s",
4901                                       sockaddr->type.sunix.sun_path, strbuf);
4902                         break;
4903                 }
4904                 goto cleanup;
4905         }
4906
4907         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4908                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4909                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4910                               "isc_socket_cleanunix: %s: not a socket",
4911                               sockaddr->type.sunix.sun_path);
4912                 goto cleanup;
4913         }
4914
4915         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4916                     sizeof(sockaddr->type.sunix)) < 0) {
4917                 switch (errno) {
4918                 case ECONNREFUSED:
4919                 case ECONNRESET:
4920                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4921                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4922                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4923                                               ISC_LOGMODULE_SOCKET,
4924                                               ISC_LOG_WARNING,
4925                                               "isc_socket_cleanunix: "
4926                                               "unlink(%s): %s",
4927                                               sockaddr->type.sunix.sun_path,
4928                                               strbuf);
4929                         }
4930                         break;
4931                 default:
4932                         isc__strerror(errno, strbuf, sizeof(strbuf));
4933                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4934                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4935                                       "isc_socket_cleanunix: connect(%s): %s",
4936                                       sockaddr->type.sunix.sun_path, strbuf);
4937                         break;
4938                 }
4939         }
4940  cleanup:
4941         close(s);
4942 #else
4943         UNUSED(sockaddr);
4944         UNUSED(active);
4945 #endif
4946 }
4947
4948 ISC_SOCKETFUNC_SCOPE isc_result_t
4949 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4950                     isc_uint32_t owner, isc_uint32_t group)
4951 {
4952 #ifdef ISC_PLATFORM_HAVESYSUNH
4953         isc_result_t result = ISC_R_SUCCESS;
4954         char strbuf[ISC_STRERRORSIZE];
4955         char path[sizeof(sockaddr->type.sunix.sun_path)];
4956 #ifdef NEED_SECURE_DIRECTORY
4957         char *slash;
4958 #endif
4959
4960         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4961         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4962         strcpy(path, sockaddr->type.sunix.sun_path);
4963
4964 #ifdef NEED_SECURE_DIRECTORY
4965         slash = strrchr(path, '/');
4966         if (slash != NULL) {
4967                 if (slash != path)
4968                         *slash = '\0';
4969                 else
4970                         strcpy(path, "/");
4971         } else
4972                 strcpy(path, ".");
4973 #endif
4974
4975         if (chmod(path, perm) < 0) {
4976                 isc__strerror(errno, strbuf, sizeof(strbuf));
4977                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4978                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4979                               "isc_socket_permunix: chmod(%s, %d): %s",
4980                               path, perm, strbuf);
4981                 result = ISC_R_FAILURE;
4982         }
4983         if (chown(path, owner, group) < 0) {
4984                 isc__strerror(errno, strbuf, sizeof(strbuf));
4985                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4986                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4987                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4988                               path, owner, group,
4989                               strbuf);
4990                 result = ISC_R_FAILURE;
4991         }
4992         return (result);
4993 #else
4994         UNUSED(sockaddr);
4995         UNUSED(perm);
4996         UNUSED(owner);
4997         UNUSED(group);
4998         return (ISC_R_NOTIMPLEMENTED);
4999 #endif
5000 }
5001
5002 ISC_SOCKETFUNC_SCOPE isc_result_t
5003 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
5004                  unsigned int options) {
5005         isc__socket_t *sock = (isc__socket_t *)sock0;
5006         char strbuf[ISC_STRERRORSIZE];
5007         int on = 1;
5008
5009         REQUIRE(VALID_SOCKET(sock));
5010
5011         LOCK(&sock->lock);
5012
5013         INSIST(!sock->bound);
5014
5015         if (sock->pf != sockaddr->type.sa.sa_family) {
5016                 UNLOCK(&sock->lock);
5017                 return (ISC_R_FAMILYMISMATCH);
5018         }
5019         /*
5020          * Only set SO_REUSEADDR when we want a specific port.
5021          */
5022 #ifdef AF_UNIX
5023         if (sock->pf == AF_UNIX)
5024                 goto bind_socket;
5025 #endif
5026         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
5027             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
5028             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
5029                        sizeof(on)) < 0) {
5030                 UNEXPECTED_ERROR(__FILE__, __LINE__,
5031                                  "setsockopt(%d) %s", sock->fd,
5032                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
5033                                                 ISC_MSG_FAILED, "failed"));
5034                 /* Press on... */
5035         }
5036 #ifdef AF_UNIX
5037  bind_socket:
5038 #endif
5039         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
5040                 inc_stats(sock->manager->stats,
5041                           sock->statsindex[STATID_BINDFAIL]);
5042
5043                 UNLOCK(&sock->lock);
5044                 switch (errno) {
5045                 case EACCES:
5046                         return (ISC_R_NOPERM);
5047                 case EADDRNOTAVAIL:
5048                         return (ISC_R_ADDRNOTAVAIL);
5049                 case EADDRINUSE:
5050                         return (ISC_R_ADDRINUSE);
5051                 case EINVAL:
5052                         return (ISC_R_BOUND);
5053                 default:
5054                         isc__strerror(errno, strbuf, sizeof(strbuf));
5055                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
5056                                          strbuf);
5057                         return (ISC_R_UNEXPECTED);
5058                 }
5059         }
5060
5061         socket_log(sock, sockaddr, TRACE,
5062                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
5063         sock->bound = 1;
5064
5065         UNLOCK(&sock->lock);
5066         return (ISC_R_SUCCESS);
5067 }
5068
5069 /*
5070  * Enable this only for specific OS versions, and only when they have repaired
5071  * their problems with it.  Until then, this is broken and needs to be
5072  * disabled by default.  See RT22589 for details.
5073  */
5074 #undef ENABLE_ACCEPTFILTER
5075
5076 ISC_SOCKETFUNC_SCOPE isc_result_t
5077 isc__socket_filter(isc_socket_t *sock0, const char *filter) {
5078         isc__socket_t *sock = (isc__socket_t *)sock0;
5079 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5080         char strbuf[ISC_STRERRORSIZE];
5081         struct accept_filter_arg afa;
5082 #else
5083         UNUSED(sock);
5084         UNUSED(filter);
5085 #endif
5086
5087         REQUIRE(VALID_SOCKET(sock));
5088
5089 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5090         bzero(&afa, sizeof(afa));
5091         strncpy(afa.af_name, filter, sizeof(afa.af_name));
5092         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
5093                          &afa, sizeof(afa)) == -1) {
5094                 isc__strerror(errno, strbuf, sizeof(strbuf));
5095                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
5096                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
5097                            strbuf);
5098                 return (ISC_R_FAILURE);
5099         }
5100         return (ISC_R_SUCCESS);
5101 #else
5102         return (ISC_R_NOTIMPLEMENTED);
5103 #endif
5104 }
5105
5106 /*
5107  * Set up to listen on a given socket.  We do this by creating an internal
5108  * event that will be dispatched when the socket has read activity.  The
5109  * watcher will send the internal event to the task when there is a new
5110  * connection.
5111  *
5112  * Unlike in read, we don't preallocate a done event here.  Every time there
5113  * is a new connection we'll have to allocate a new one anyway, so we might
5114  * as well keep things simple rather than having to track them.
5115  */
5116 ISC_SOCKETFUNC_SCOPE isc_result_t
5117 isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
5118         isc__socket_t *sock = (isc__socket_t *)sock0;
5119         char strbuf[ISC_STRERRORSIZE];
5120
5121         REQUIRE(VALID_SOCKET(sock));
5122
5123         LOCK(&sock->lock);
5124
5125         REQUIRE(!sock->listener);
5126         REQUIRE(sock->bound);
5127         REQUIRE(sock->type == isc_sockettype_tcp ||
5128                 sock->type == isc_sockettype_unix);
5129
5130         if (backlog == 0)
5131                 backlog = SOMAXCONN;
5132
5133         if (listen(sock->fd, (int)backlog) < 0) {
5134                 UNLOCK(&sock->lock);
5135                 isc__strerror(errno, strbuf, sizeof(strbuf));
5136
5137                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
5138
5139                 return (ISC_R_UNEXPECTED);
5140         }
5141
5142         sock->listener = 1;
5143
5144         UNLOCK(&sock->lock);
5145         return (ISC_R_SUCCESS);
5146 }
5147
5148 /*
5149  * This should try to do aggressive accept() XXXMLG
5150  */
5151 ISC_SOCKETFUNC_SCOPE isc_result_t
5152 isc__socket_accept(isc_socket_t *sock0,
5153                   isc_task_t *task, isc_taskaction_t action, const void *arg)
5154 {
5155         isc__socket_t *sock = (isc__socket_t *)sock0;
5156         isc_socket_newconnev_t *dev;
5157         isc__socketmgr_t *manager;
5158         isc_task_t *ntask = NULL;
5159         isc__socket_t *nsock;
5160         isc_result_t result;
5161         isc_boolean_t do_poke = ISC_FALSE;
5162
5163         REQUIRE(VALID_SOCKET(sock));
5164         manager = sock->manager;
5165         REQUIRE(VALID_MANAGER(manager));
5166
5167         LOCK(&sock->lock);
5168
5169         REQUIRE(sock->listener);
5170
5171         /*
5172          * Sender field is overloaded here with the task we will be sending
5173          * this event to.  Just before the actual event is delivered the
5174          * actual ev_sender will be touched up to be the socket.
5175          */
5176         dev = (isc_socket_newconnev_t *)
5177                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
5178                                    action, arg, sizeof(*dev));
5179         if (dev == NULL) {
5180                 UNLOCK(&sock->lock);
5181                 return (ISC_R_NOMEMORY);
5182         }
5183         ISC_LINK_INIT(dev, ev_link);
5184
5185         result = allocate_socket(manager, sock->type, &nsock);
5186         if (result != ISC_R_SUCCESS) {
5187                 isc_event_free(ISC_EVENT_PTR(&dev));
5188                 UNLOCK(&sock->lock);
5189                 return (result);
5190         }
5191
5192         /*
5193          * Attach to socket and to task.
5194          */
5195         isc_task_attach(task, &ntask);
5196         if (isc_task_exiting(ntask)) {
5197                 free_socket(&nsock);
5198                 isc_task_detach(&ntask);
5199                 isc_event_free(ISC_EVENT_PTR(&dev));
5200                 UNLOCK(&sock->lock);
5201                 return (ISC_R_SHUTTINGDOWN);
5202         }
5203         nsock->references++;
5204         nsock->statsindex = sock->statsindex;
5205
5206         dev->ev_sender = ntask;
5207         dev->newsocket = (isc_socket_t *)nsock;
5208
5209         /*
5210          * Poke watcher here.  We still have the socket locked, so there
5211          * is no race condition.  We will keep the lock for such a short
5212          * bit of time waking it up now or later won't matter all that much.
5213          */
5214         if (ISC_LIST_EMPTY(sock->accept_list))
5215                 do_poke = ISC_TRUE;
5216
5217         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
5218
5219         if (do_poke)
5220                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
5221
5222         UNLOCK(&sock->lock);
5223         return (ISC_R_SUCCESS);
5224 }
5225
5226 ISC_SOCKETFUNC_SCOPE isc_result_t
5227 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
5228                    isc_task_t *task, isc_taskaction_t action, const void *arg)
5229 {
5230         isc__socket_t *sock = (isc__socket_t *)sock0;
5231         isc_socket_connev_t *dev;
5232         isc_task_t *ntask = NULL;
5233         isc__socketmgr_t *manager;
5234         int cc;
5235         char strbuf[ISC_STRERRORSIZE];
5236         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
5237
5238         REQUIRE(VALID_SOCKET(sock));
5239         REQUIRE(addr != NULL);
5240         REQUIRE(task != NULL);
5241         REQUIRE(action != NULL);
5242
5243         manager = sock->manager;
5244         REQUIRE(VALID_MANAGER(manager));
5245         REQUIRE(addr != NULL);
5246
5247         if (isc_sockaddr_ismulticast(addr))
5248                 return (ISC_R_MULTICAST);
5249
5250         LOCK(&sock->lock);
5251
5252         REQUIRE(!sock->connecting);
5253
5254         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
5255                                                         ISC_SOCKEVENT_CONNECT,
5256                                                         action, arg,
5257                                                         sizeof(*dev));
5258         if (dev == NULL) {
5259                 UNLOCK(&sock->lock);
5260                 return (ISC_R_NOMEMORY);
5261         }
5262         ISC_LINK_INIT(dev, ev_link);
5263
5264         /*
5265          * Try to do the connect right away, as there can be only one
5266          * outstanding, and it might happen to complete.
5267          */
5268         sock->peer_address = *addr;
5269         cc = connect(sock->fd, &addr->type.sa, addr->length);
5270         if (cc < 0) {
5271                 /*
5272                  * HP-UX "fails" to connect a UDP socket and sets errno to
5273                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
5274                  * a success and let the user detect it if it's really an error
5275                  * at the time of sending a packet on the socket.
5276                  */
5277                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
5278                         cc = 0;
5279                         goto success;
5280                 }
5281                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
5282                         goto queue;
5283
5284                 switch (errno) {
5285 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
5286                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5287                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5288                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5289                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5290                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5291 #ifdef EHOSTDOWN
5292                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5293 #endif
5294                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5295                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5296                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5297                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5298                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5299 #undef ERROR_MATCH
5300                 }
5301
5302                 sock->connected = 0;
5303
5304                 isc__strerror(errno, strbuf, sizeof(strbuf));
5305                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
5306                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
5307                                  addrbuf, errno, strbuf);
5308
5309                 UNLOCK(&sock->lock);
5310                 inc_stats(sock->manager->stats,
5311                           sock->statsindex[STATID_CONNECTFAIL]);
5312                 isc_event_free(ISC_EVENT_PTR(&dev));
5313                 return (ISC_R_UNEXPECTED);
5314
5315         err_exit:
5316                 sock->connected = 0;
5317                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5318
5319                 UNLOCK(&sock->lock);
5320                 inc_stats(sock->manager->stats,
5321                           sock->statsindex[STATID_CONNECTFAIL]);
5322                 return (ISC_R_SUCCESS);
5323         }
5324
5325         /*
5326          * If connect completed, fire off the done event.
5327          */
5328  success:
5329         if (cc == 0) {
5330                 sock->connected = 1;
5331                 sock->bound = 1;
5332                 dev->result = ISC_R_SUCCESS;
5333                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5334
5335                 UNLOCK(&sock->lock);
5336
5337                 inc_stats(sock->manager->stats,
5338                           sock->statsindex[STATID_CONNECT]);
5339
5340                 return (ISC_R_SUCCESS);
5341         }
5342
5343  queue:
5344
5345         /*
5346          * Attach to task.
5347          */
5348         isc_task_attach(task, &ntask);
5349
5350         sock->connecting = 1;
5351
5352         dev->ev_sender = ntask;
5353
5354         /*
5355          * Poke watcher here.  We still have the socket locked, so there
5356          * is no race condition.  We will keep the lock for such a short
5357          * bit of time waking it up now or later won't matter all that much.
5358          */
5359         if (sock->connect_ev == NULL)
5360                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
5361
5362         sock->connect_ev = dev;
5363
5364         UNLOCK(&sock->lock);
5365         return (ISC_R_SUCCESS);
5366 }
5367
5368 /*
5369  * Called when a socket with a pending connect() finishes.
5370  */
5371 static void
5372 internal_connect(isc_task_t *me, isc_event_t *ev) {
5373         isc__socket_t *sock;
5374         isc_socket_connev_t *dev;
5375         isc_task_t *task;
5376         int cc;
5377         ISC_SOCKADDR_LEN_T optlen;
5378         char strbuf[ISC_STRERRORSIZE];
5379         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5380
5381         UNUSED(me);
5382         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5383
5384         sock = ev->ev_sender;
5385         INSIST(VALID_SOCKET(sock));
5386
5387         LOCK(&sock->lock);
5388
5389         /*
5390          * When the internal event was sent the reference count was bumped
5391          * to keep the socket around for us.  Decrement the count here.
5392          */
5393         INSIST(sock->references > 0);
5394         sock->references--;
5395         if (sock->references == 0) {
5396                 UNLOCK(&sock->lock);
5397                 destroy(&sock);
5398                 return;
5399         }
5400
5401         /*
5402          * Has this event been canceled?
5403          */
5404         dev = sock->connect_ev;
5405         if (dev == NULL) {
5406                 INSIST(!sock->connecting);
5407                 UNLOCK(&sock->lock);
5408                 return;
5409         }
5410
5411         INSIST(sock->connecting);
5412         sock->connecting = 0;
5413
5414         /*
5415          * Get any possible error status here.
5416          */
5417         optlen = sizeof(cc);
5418         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5419                        (void *)&cc, (void *)&optlen) < 0)
5420                 cc = errno;
5421         else
5422                 errno = cc;
5423
5424         if (errno != 0) {
5425                 /*
5426                  * If the error is EAGAIN, just re-select on this
5427                  * fd and pretend nothing strange happened.
5428                  */
5429                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5430                         sock->connecting = 1;
5431                         select_poke(sock->manager, sock->fd,
5432                                     SELECT_POKE_CONNECT);
5433                         UNLOCK(&sock->lock);
5434
5435                         return;
5436                 }
5437
5438                 inc_stats(sock->manager->stats,
5439                           sock->statsindex[STATID_CONNECTFAIL]);
5440
5441                 /*
5442                  * Translate other errors into ISC_R_* flavors.
5443                  */
5444                 switch (errno) {
5445 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5446                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5447                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5448                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5449                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5450                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5451 #ifdef EHOSTDOWN
5452                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5453 #endif
5454                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5455                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5456                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5457                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5458                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5459                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5460 #undef ERROR_MATCH
5461                 default:
5462                         dev->result = ISC_R_UNEXPECTED;
5463                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5464                                             sizeof(peerbuf));
5465                         isc__strerror(errno, strbuf, sizeof(strbuf));
5466                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5467                                          "internal_connect: connect(%s) %s",
5468                                          peerbuf, strbuf);
5469                 }
5470         } else {
5471                 inc_stats(sock->manager->stats,
5472                           sock->statsindex[STATID_CONNECT]);
5473                 dev->result = ISC_R_SUCCESS;
5474                 sock->connected = 1;
5475                 sock->bound = 1;
5476         }
5477
5478         sock->connect_ev = NULL;
5479
5480         UNLOCK(&sock->lock);
5481
5482         task = dev->ev_sender;
5483         dev->ev_sender = sock;
5484         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5485 }
5486
5487 ISC_SOCKETFUNC_SCOPE isc_result_t
5488 isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5489         isc__socket_t *sock = (isc__socket_t *)sock0;
5490         isc_result_t result;
5491
5492         REQUIRE(VALID_SOCKET(sock));
5493         REQUIRE(addressp != NULL);
5494
5495         LOCK(&sock->lock);
5496
5497         if (sock->connected) {
5498                 *addressp = sock->peer_address;
5499                 result = ISC_R_SUCCESS;
5500         } else {
5501                 result = ISC_R_NOTCONNECTED;
5502         }
5503
5504         UNLOCK(&sock->lock);
5505
5506         return (result);
5507 }
5508
5509 ISC_SOCKETFUNC_SCOPE isc_result_t
5510 isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5511         isc__socket_t *sock = (isc__socket_t *)sock0;
5512         ISC_SOCKADDR_LEN_T len;
5513         isc_result_t result;
5514         char strbuf[ISC_STRERRORSIZE];
5515
5516         REQUIRE(VALID_SOCKET(sock));
5517         REQUIRE(addressp != NULL);
5518
5519         LOCK(&sock->lock);
5520
5521         if (!sock->bound) {
5522                 result = ISC_R_NOTBOUND;
5523                 goto out;
5524         }
5525
5526         result = ISC_R_SUCCESS;
5527
5528         len = sizeof(addressp->type);
5529         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5530                 isc__strerror(errno, strbuf, sizeof(strbuf));
5531                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5532                                  strbuf);
5533                 result = ISC_R_UNEXPECTED;
5534                 goto out;
5535         }
5536         addressp->length = (unsigned int)len;
5537
5538  out:
5539         UNLOCK(&sock->lock);
5540
5541         return (result);
5542 }
5543
5544 /*
5545  * Run through the list of events on this socket, and cancel the ones
5546  * queued for task "task" of type "how".  "how" is a bitmask.
5547  */
5548 ISC_SOCKETFUNC_SCOPE void
5549 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
5550         isc__socket_t *sock = (isc__socket_t *)sock0;
5551
5552         REQUIRE(VALID_SOCKET(sock));
5553
5554         /*
5555          * Quick exit if there is nothing to do.  Don't even bother locking
5556          * in this case.
5557          */
5558         if (how == 0)
5559                 return;
5560
5561         LOCK(&sock->lock);
5562
5563         /*
5564          * All of these do the same thing, more or less.
5565          * Each will:
5566          *      o If the internal event is marked as "posted" try to
5567          *        remove it from the task's queue.  If this fails, mark it
5568          *        as canceled instead, and let the task clean it up later.
5569          *      o For each I/O request for that task of that type, post
5570          *        its done event with status of "ISC_R_CANCELED".
5571          *      o Reset any state needed.
5572          */
5573         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5574             && !ISC_LIST_EMPTY(sock->recv_list)) {
5575                 isc_socketevent_t      *dev;
5576                 isc_socketevent_t      *next;
5577                 isc_task_t             *current_task;
5578
5579                 dev = ISC_LIST_HEAD(sock->recv_list);
5580
5581                 while (dev != NULL) {
5582                         current_task = dev->ev_sender;
5583                         next = ISC_LIST_NEXT(dev, ev_link);
5584
5585                         if ((task == NULL) || (task == current_task)) {
5586                                 dev->result = ISC_R_CANCELED;
5587                                 send_recvdone_event(sock, &dev);
5588                         }
5589                         dev = next;
5590                 }
5591         }
5592
5593         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5594             && !ISC_LIST_EMPTY(sock->send_list)) {
5595                 isc_socketevent_t      *dev;
5596                 isc_socketevent_t      *next;
5597                 isc_task_t             *current_task;
5598
5599                 dev = ISC_LIST_HEAD(sock->send_list);
5600
5601                 while (dev != NULL) {
5602                         current_task = dev->ev_sender;
5603                         next = ISC_LIST_NEXT(dev, ev_link);
5604
5605                         if ((task == NULL) || (task == current_task)) {
5606                                 dev->result = ISC_R_CANCELED;
5607                                 send_senddone_event(sock, &dev);
5608                         }
5609                         dev = next;
5610                 }
5611         }
5612
5613         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5614             && !ISC_LIST_EMPTY(sock->accept_list)) {
5615                 isc_socket_newconnev_t *dev;
5616                 isc_socket_newconnev_t *next;
5617                 isc_task_t             *current_task;
5618
5619                 dev = ISC_LIST_HEAD(sock->accept_list);
5620                 while (dev != NULL) {
5621                         current_task = dev->ev_sender;
5622                         next = ISC_LIST_NEXT(dev, ev_link);
5623
5624                         if ((task == NULL) || (task == current_task)) {
5625
5626                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5627                                                 ev_link);
5628
5629                                 NEWCONNSOCK(dev)->references--;
5630                                 free_socket((isc__socket_t **)&dev->newsocket);
5631
5632                                 dev->result = ISC_R_CANCELED;
5633                                 dev->ev_sender = sock;
5634                                 isc_task_sendanddetach(&current_task,
5635                                                        ISC_EVENT_PTR(&dev));
5636                         }
5637
5638                         dev = next;
5639                 }
5640         }
5641
5642         /*
5643          * Connecting is not a list.
5644          */
5645         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5646             && sock->connect_ev != NULL) {
5647                 isc_socket_connev_t    *dev;
5648                 isc_task_t             *current_task;
5649
5650                 INSIST(sock->connecting);
5651                 sock->connecting = 0;
5652
5653                 dev = sock->connect_ev;
5654                 current_task = dev->ev_sender;
5655
5656                 if ((task == NULL) || (task == current_task)) {
5657                         sock->connect_ev = NULL;
5658
5659                         dev->result = ISC_R_CANCELED;
5660                         dev->ev_sender = sock;
5661                         isc_task_sendanddetach(&current_task,
5662                                                ISC_EVENT_PTR(&dev));
5663                 }
5664         }
5665
5666         UNLOCK(&sock->lock);
5667 }
5668
5669 ISC_SOCKETFUNC_SCOPE isc_sockettype_t
5670 isc__socket_gettype(isc_socket_t *sock0) {
5671         isc__socket_t *sock = (isc__socket_t *)sock0;
5672
5673         REQUIRE(VALID_SOCKET(sock));
5674
5675         return (sock->type);
5676 }
5677
5678 ISC_SOCKETFUNC_SCOPE isc_boolean_t
5679 isc__socket_isbound(isc_socket_t *sock0) {
5680         isc__socket_t *sock = (isc__socket_t *)sock0;
5681         isc_boolean_t val;
5682
5683         REQUIRE(VALID_SOCKET(sock));
5684
5685         LOCK(&sock->lock);
5686         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5687         UNLOCK(&sock->lock);
5688
5689         return (val);
5690 }
5691
5692 ISC_SOCKETFUNC_SCOPE void
5693 isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
5694         isc__socket_t *sock = (isc__socket_t *)sock0;
5695 #if defined(IPV6_V6ONLY)
5696         int onoff = yes ? 1 : 0;
5697 #else
5698         UNUSED(yes);
5699         UNUSED(sock);
5700 #endif
5701
5702         REQUIRE(VALID_SOCKET(sock));
5703
5704 #ifdef IPV6_V6ONLY
5705         if (sock->pf == AF_INET6) {
5706                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5707                                (void *)&onoff, sizeof(int)) < 0) {
5708                         char strbuf[ISC_STRERRORSIZE];
5709                         isc__strerror(errno, strbuf, sizeof(strbuf));
5710                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5711                                          "setsockopt(%d, IPV6_V6ONLY) "
5712                                          "%s: %s", sock->fd,
5713                                          isc_msgcat_get(isc_msgcat,
5714                                                         ISC_MSGSET_GENERAL,
5715                                                         ISC_MSG_FAILED,
5716                                                         "failed"),
5717                                          strbuf);
5718                 }
5719         }
5720         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5721 #endif
5722 }
5723
5724 #ifndef USE_WATCHER_THREAD
5725 /*
5726  * In our assumed scenario, we can simply use a single static object.
5727  * XXX: this is not true if the application uses multiple threads with
5728  *      'multi-context' mode.  Fixing this is a future TODO item.
5729  */
5730 static isc_socketwait_t swait_private;
5731
5732 int
5733 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
5734                           isc_socketwait_t **swaitp)
5735 {
5736         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5737
5738
5739         int n;
5740 #ifdef USE_KQUEUE
5741         struct timespec ts, *tsp;
5742 #endif
5743 #ifdef USE_EPOLL
5744         int timeout;
5745 #endif
5746 #ifdef USE_DEVPOLL
5747         struct dvpoll dvp;
5748 #endif
5749
5750         REQUIRE(swaitp != NULL && *swaitp == NULL);
5751
5752 #ifdef USE_SHARED_MANAGER
5753         if (manager == NULL)
5754                 manager = socketmgr;
5755 #endif
5756         if (manager == NULL)
5757                 return (0);
5758
5759 #ifdef USE_KQUEUE
5760         if (tvp != NULL) {
5761                 ts.tv_sec = tvp->tv_sec;
5762                 ts.tv_nsec = tvp->tv_usec * 1000;
5763                 tsp = &ts;
5764         } else
5765                 tsp = NULL;
5766         swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
5767                                        manager->events, manager->nevents,
5768                                        tsp);
5769         n = swait_private.nevents;
5770 #elif defined(USE_EPOLL)
5771         if (tvp != NULL)
5772                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5773         else
5774                 timeout = -1;
5775         swait_private.nevents = epoll_wait(manager->epoll_fd,
5776                                            manager->events,
5777                                            manager->nevents, timeout);
5778         n = swait_private.nevents;
5779 #elif defined(USE_DEVPOLL)
5780         dvp.dp_fds = manager->events;
5781         dvp.dp_nfds = manager->nevents;
5782         if (tvp != NULL) {
5783                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5784                         (tvp->tv_usec + 999) / 1000;
5785         } else
5786                 dvp.dp_timeout = -1;
5787         swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
5788         n = swait_private.nevents;
5789 #elif defined(USE_SELECT)
5790         memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
5791         memmove(manager->write_fds_copy, manager->write_fds,
5792                 manager->fd_bufsize);
5793
5794         swait_private.readset = manager->read_fds_copy;
5795         swait_private.writeset = manager->write_fds_copy;
5796         swait_private.maxfd = manager->maxfd + 1;
5797
5798         n = select(swait_private.maxfd, swait_private.readset,
5799                    swait_private.writeset, NULL, tvp);
5800 #endif
5801
5802         *swaitp = &swait_private;
5803         return (n);
5804 }
5805
5806 isc_result_t
5807 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
5808         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5809
5810         REQUIRE(swait == &swait_private);
5811
5812 #ifdef USE_SHARED_MANAGER
5813         if (manager == NULL)
5814                 manager = socketmgr;
5815 #endif
5816         if (manager == NULL)
5817                 return (ISC_R_NOTFOUND);
5818
5819 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5820         (void)process_fds(manager, manager->events, swait->nevents);
5821         return (ISC_R_SUCCESS);
5822 #elif defined(USE_SELECT)
5823         process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
5824         return (ISC_R_SUCCESS);
5825 #endif
5826 }
5827 #endif /* USE_WATCHER_THREAD */
5828
5829 #ifdef BIND9
5830 void
5831 isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5832         isc__socket_t *socket = (isc__socket_t *)socket0;
5833
5834         /*
5835          * Name 'socket'.
5836          */
5837
5838         REQUIRE(VALID_SOCKET(socket));
5839
5840         LOCK(&socket->lock);
5841         memset(socket->name, 0, sizeof(socket->name));
5842         strncpy(socket->name, name, sizeof(socket->name) - 1);
5843         socket->tag = tag;
5844         UNLOCK(&socket->lock);
5845 }
5846
5847 ISC_SOCKETFUNC_SCOPE const char *
5848 isc__socket_getname(isc_socket_t *socket0) {
5849         isc__socket_t *socket = (isc__socket_t *)socket0;
5850
5851         return (socket->name);
5852 }
5853
5854 void *
5855 isc__socket_gettag(isc_socket_t *socket0) {
5856         isc__socket_t *socket = (isc__socket_t *)socket0;
5857
5858         return (socket->tag);
5859 }
5860 #endif  /* BIND9 */
5861
5862 #ifdef USE_SOCKETIMPREGISTER
5863 isc_result_t
5864 isc__socket_register() {
5865         return (isc_socket_register(isc__socketmgr_create));
5866 }
5867 #endif
5868
5869 #if defined(HAVE_LIBXML2) && defined(BIND9)
5870
5871 static const char *
5872 _socktype(isc_sockettype_t type)
5873 {
5874         if (type == isc_sockettype_udp)
5875                 return ("udp");
5876         else if (type == isc_sockettype_tcp)
5877                 return ("tcp");
5878         else if (type == isc_sockettype_unix)
5879                 return ("unix");
5880         else if (type == isc_sockettype_fdwatch)
5881                 return ("fdwatch");
5882         else
5883                 return ("not-initialized");
5884 }
5885
5886 #define TRY0(a) do { xmlrc = (a); if (xmlrc < 0) goto error; } while(0)
5887 ISC_SOCKETFUNC_SCOPE int
5888 isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
5889         isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
5890         isc__socket_t *sock = NULL;
5891         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5892         isc_sockaddr_t addr;
5893         ISC_SOCKADDR_LEN_T len;
5894         int xmlrc;
5895
5896         LOCK(&mgr->lock);
5897
5898 #ifdef USE_SHARED_MANAGER
5899         TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"));
5900         TRY0(xmlTextWriterWriteFormatString(writer, "%d", mgr->refs));
5901         TRY0(xmlTextWriterEndElement(writer));
5902 #endif  /* USE_SHARED_MANAGER */
5903
5904         TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
5905         sock = ISC_LIST_HEAD(mgr->socklist);
5906         while (sock != NULL) {
5907                 LOCK(&sock->lock);
5908                 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));
5909
5910                 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
5911                 TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
5912                 TRY0(xmlTextWriterEndElement(writer));
5913
5914                 if (sock->name[0] != 0) {
5915                         TRY0(xmlTextWriterStartElement(writer,
5916                                                        ISC_XMLCHAR "name"));
5917                         TRY0(xmlTextWriterWriteFormatString(writer, "%s",
5918                                                             sock->name));
5919                         TRY0(xmlTextWriterEndElement(writer)); /* name */
5920                 }
5921
5922                 TRY0(xmlTextWriterStartElement(writer,
5923                                                ISC_XMLCHAR "references"));
5924                 TRY0(xmlTextWriterWriteFormatString(writer, "%d",
5925                                                     sock->references));
5926                 TRY0(xmlTextWriterEndElement(writer));
5927
5928                 TRY0(xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5929                                           ISC_XMLCHAR _socktype(sock->type)));
5930
5931                 if (sock->connected) {
5932                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5933                                             sizeof(peerbuf));
5934                         TRY0(xmlTextWriterWriteElement(writer,
5935                                                   ISC_XMLCHAR "peer-address",
5936                                                   ISC_XMLCHAR peerbuf));
5937                 }
5938
5939                 len = sizeof(addr);
5940                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5941                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5942                         TRY0(xmlTextWriterWriteElement(writer,
5943                                                   ISC_XMLCHAR "local-address",
5944                                                   ISC_XMLCHAR peerbuf));
5945                 }
5946
5947                 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
5948                 if (sock->pending_recv)
5949                         TRY0(xmlTextWriterWriteElement(writer,
5950                                                 ISC_XMLCHAR "state",
5951                                                 ISC_XMLCHAR "pending-receive"));
5952                 if (sock->pending_send)
5953                         TRY0(xmlTextWriterWriteElement(writer,
5954                                                   ISC_XMLCHAR "state",
5955                                                   ISC_XMLCHAR "pending-send"));
5956                 if (sock->pending_accept)
5957                         TRY0(xmlTextWriterWriteElement(writer,
5958                                                  ISC_XMLCHAR "state",
5959                                                  ISC_XMLCHAR "pending_accept"));
5960                 if (sock->listener)
5961                         TRY0(xmlTextWriterWriteElement(writer,
5962                                                        ISC_XMLCHAR "state",
5963                                                        ISC_XMLCHAR "listener"));
5964                 if (sock->connected)
5965                         TRY0(xmlTextWriterWriteElement(writer,
5966                                                      ISC_XMLCHAR "state",
5967                                                      ISC_XMLCHAR "connected"));
5968                 if (sock->connecting)
5969                         TRY0(xmlTextWriterWriteElement(writer,
5970                                                     ISC_XMLCHAR "state",
5971                                                     ISC_XMLCHAR "connecting"));
5972                 if (sock->bound)
5973                         TRY0(xmlTextWriterWriteElement(writer,
5974                                                        ISC_XMLCHAR "state",
5975                                                        ISC_XMLCHAR "bound"));
5976
5977                 TRY0(xmlTextWriterEndElement(writer)); /* states */
5978
5979                 TRY0(xmlTextWriterEndElement(writer)); /* socket */
5980
5981                 UNLOCK(&sock->lock);
5982                 sock = ISC_LIST_NEXT(sock, link);
5983         }
5984         TRY0(xmlTextWriterEndElement(writer)); /* sockets */
5985
5986  error:
5987         if (sock != NULL)
5988                 UNLOCK(&sock->lock);
5989
5990         UNLOCK(&mgr->lock);
5991
5992         return (xmlrc);
5993 }
5994 #endif /* HAVE_LIBXML2 */