]> CyberLeo.Net >> Repos - FreeBSD/releng/10.2.git/blob - contrib/ntp/lib/isc/unix/socket.c
- Copy stable/10@285827 to releng/10.2 in preparation for 10.2-RC1
[FreeBSD/releng/10.2.git] / contrib / ntp / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id$ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #if defined(HAVE_SYS_DEVPOLL_H)
71 #include <sys/devpoll.h>
72 #elif defined(HAVE_DEVPOLL_H)
73 #include <devpoll.h>
74 #endif
75 #endif
76
77 #include "errno2result.h"
78
79 /* See task.c about the following definition: */
80 #ifdef BIND9
81 #ifdef ISC_PLATFORM_USETHREADS
82 #define USE_WATCHER_THREAD
83 #else
84 #define USE_SHARED_MANAGER
85 #endif  /* ISC_PLATFORM_USETHREADS */
86 #endif  /* BIND9 */
87
88 #ifndef USE_WATCHER_THREAD
89 #include "socket_p.h"
90 #include "../task_p.h"
91 #endif /* USE_WATCHER_THREAD */
92
93 #if defined(SO_BSDCOMPAT) && defined(__linux__)
94 #include <sys/utsname.h>
95 #endif
96
97 /*%
98  * Choose the most preferable multiplex method.
99  */
100 #ifdef ISC_PLATFORM_HAVEKQUEUE
101 #define USE_KQUEUE
102 #elif defined (ISC_PLATFORM_HAVEEPOLL)
103 #define USE_EPOLL
104 #elif defined (ISC_PLATFORM_HAVEDEVPOLL)
105 #define USE_DEVPOLL
/*%
 * Per-descriptor flags for the /dev/poll backend, one bit per I/O
 * direction.
 */
typedef struct {
        unsigned int want_read : 1;
        unsigned int want_write : 1;
} pollinfo_t;
110 #else
111 #define USE_SELECT
112 #endif  /* ISC_PLATFORM_HAVEKQUEUE */
113
114 #ifndef USE_WATCHER_THREAD
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*%
 * Wait state used with the kqueue, epoll and /dev/poll backends.
 */
struct isc_socketwait {
        int nevents;
};
#elif defined(USE_SELECT)
/*%
 * Wait state used with the select() backend.
 */
struct isc_socketwait {
        fd_set *readset;
        fd_set *writeset;
        int nfds;
        int maxfd;
};
#endif  /* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */
127 #endif /* !USE_WATCHER_THREAD */
128
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system
 * default of) FD_SETSIZE descriptors, and the default size should in fact be
 * fine in the vast majority of cases.  This constant should therefore be
 * increased only when absolutely necessary and possible, i.e., the server is
 * exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
148 #ifndef ISC_SOCKET_MAXSOCKETS
149 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
150 #define ISC_SOCKET_MAXSOCKETS 4096
151 #elif defined(USE_SELECT)
152 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
153 #endif  /* USE_KQUEUE... */
154 #endif  /* ISC_SOCKET_MAXSOCKETS */
155
156 #ifdef USE_SELECT
157 /*%
158  * Mac OS X needs a special definition to support larger values in select().
159  * We always define this because a larger value can be specified run-time.
160  */
161 #ifdef __APPLE__
162 #define _DARWIN_UNLIMITED_SELECT
163 #endif  /* __APPLE__ */
164 #endif  /* USE_SELECT */
165
166 #ifdef ISC_SOCKET_USE_POLLWATCH
167 /*%
168  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
169  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
170  * some of the specified FD.  The idea is based on the observation that it's
171  * likely for a busy server to keep receiving packets.  It specifically works
172  * as follows: the socket watcher is first initialized with the state of
173  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
174  * event occurs.  When it wakes up for a socket I/O event, it moves to the
175  * poll_active state, and sets the poll timeout to a short period
176  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
177  * watcher goes to the poll_checking state with the same timeout period.
178  * In this state, the watcher tries to detect whether this is a break
179  * during intermittent events or the kernel bug is triggered.  If the next
180  * polling reports an event within the short period, the previous timeout is
181  * likely to be a kernel bug, and so the watcher goes back to the active state.
182  * Otherwise, it moves to the idle state again.
183  *
184  * It's not clear whether this is a thread-related bug, but since we've only
185  * seen this with threads, this workaround is used only when enabling threads.
186  */
187
typedef enum {
        poll_idle,      /* sleeping until a socket event occurs */
        poll_active,    /* woke up for socket I/O; polling with short timeout */
        poll_checking   /* timed out while active; probing for the kernel bug */
} pollstate_t;
189
190 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
191 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
192 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
193 #endif  /* ISC_SOCKET_USE_POLLWATCH */
194
195 /*%
196  * Size of per-FD lock buckets.
197  */
198 #ifdef ISC_PLATFORM_USETHREADS
199 #define FDLOCK_COUNT            1024
200 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
201 #else
202 #define FDLOCK_COUNT            1
203 #define FDLOCK_ID(fd)           0
204 #endif  /* ISC_PLATFORM_USETHREADS */
205
206 /*%
207  * Maximum number of events communicated with the kernel.  There should normally
208  * be no need for having a large number.
209  */
210 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
211 #ifndef ISC_SOCKET_MAXEVENTS
212 #define ISC_SOCKET_MAXEVENTS    64
213 #endif
214 #endif
215
216 /*%
217  * Some systems define the socket length argument as an int, some as size_t,
218  * some as socklen_t.  This is here so it can be easily changed if needed.
219  */
220 #ifndef ISC_SOCKADDR_LEN_T
221 #define ISC_SOCKADDR_LEN_T unsigned int
222 #endif
223
224 /*%
225  * Define what the possible "soft" errors can be.  These are non-fatal returns
226  * of various network related functions, like recv() and so on.
227  *
228  * For some reason, BSDI (and perhaps others) will sometimes return <0
229  * from recv() but will have errno==0.  This is broken, but we have to
230  * work around it here.
231  */
232 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
233                          (e) == EWOULDBLOCK || \
234                          (e) == EINTR || \
235                          (e) == 0)
236
237 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
238
239 /*!<
240  * DLVL(90)  --  Function entry/exit and other tracing.
241  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
242  * DLVL(60)  --  Socket data send/receive
243  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
244  * DLVL(20)  --  Socket creation/destruction.
245  */
246 #define TRACE_LEVEL             90
247 #define CORRECTNESS_LEVEL       70
248 #define IOEVENT_LEVEL           60
249 #define EVENT_LEVEL             50
250 #define CREATION_LEVEL          20
251
252 #define TRACE           DLVL(TRACE_LEVEL)
253 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
254 #define IOEVENT         DLVL(IOEVENT_LEVEL)
255 #define EVENT           DLVL(EVENT_LEVEL)
256 #define CREATION        DLVL(CREATION_LEVEL)
257
258 typedef isc_event_t intev_t;
259
260 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
261 #define VALID_SOCKET(s)         ISC_MAGIC_VALID(s, SOCKET_MAGIC)
262
263 /*!
264  * IPv6 control information.  If the socket is an IPv6 socket we want
265  * to collect the destination address and interface so the client can
266  * set them on outgoing packets.
267  */
268 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
269 #ifndef USE_CMSG
270 #define USE_CMSG        1
271 #endif
272 #endif
273
274 /*%
275  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
276  * a setsockopt() like interface to request timestamps, and if the OS
277  * doesn't do it for us, call gettimeofday() on every UDP receive?
278  */
279 #ifdef SO_TIMESTAMP
280 #ifndef USE_CMSG
281 #define USE_CMSG        1
282 #endif
283 #endif
284
285 /*%
286  * The size to raise the receive buffer to (from BIND 8).
287  */
288 #define RCVBUFSIZE (32*1024)
289
290 /*%
291  * The number of times a send operation is repeated if the result is EINTR.
292  */
293 #define NRETRIES 10
294
295 typedef struct isc__socket isc__socket_t;
296 typedef struct isc__socketmgr isc__socketmgr_t;
297
298 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
299
300 struct isc__socket {
301         /* Not locked. */
302         isc_socket_t            common;
303         isc__socketmgr_t        *manager;
304         isc_mutex_t             lock;
305         isc_sockettype_t        type;
306         const isc_statscounter_t        *statsindex;
307
308         /* Locked by socket lock. */
309         ISC_LINK(isc__socket_t) link;
310         unsigned int            references;
311         int                     fd;
312         int                     pf;
313         char                            name[16];
314         void *                          tag;
315
316         ISC_LIST(isc_socketevent_t)             send_list;
317         ISC_LIST(isc_socketevent_t)             recv_list;
318         ISC_LIST(isc_socket_newconnev_t)        accept_list;
319         isc_socket_connev_t                    *connect_ev;
320
321         /*
322          * Internal events.  Posted when a descriptor is readable or
323          * writable.  These are statically allocated and never freed.
324          * They will be set to non-purgable before use.
325          */
326         intev_t                 readable_ev;
327         intev_t                 writable_ev;
328
329         isc_sockaddr_t          peer_address;  /* remote address */
330
331         unsigned int            pending_recv : 1,
332                                 pending_send : 1,
333                                 pending_accept : 1,
334                                 listener : 1, /* listener socket */
335                                 connected : 1,
336                                 connecting : 1, /* connect pending */
337                                 bound : 1, /* bound to local addr */
338                                 dupped : 1;
339
340 #ifdef ISC_NET_RECVOVERFLOW
341         unsigned char           overflow; /* used for MSG_TRUNC fake */
342 #endif
343
344         char                    *recvcmsgbuf;
345         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
346         char                    *sendcmsgbuf;
347         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
348
349         void                    *fdwatcharg;
350         isc_sockfdwatch_t       fdwatchcb;
351         int                     fdwatchflags;
352         isc_task_t              *fdwatchtask;
353 };
354
355 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
356 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
357
358 struct isc__socketmgr {
359         /* Not locked. */
360         isc_socketmgr_t         common;
361         isc_mem_t              *mctx;
362         isc_mutex_t             lock;
363         isc_mutex_t             *fdlock;
364         isc_stats_t             *stats;
365 #ifdef USE_KQUEUE
366         int                     kqueue_fd;
367         int                     nevents;
368         struct kevent           *events;
369 #endif  /* USE_KQUEUE */
370 #ifdef USE_EPOLL
371         int                     epoll_fd;
372         int                     nevents;
373         struct epoll_event      *events;
374 #endif  /* USE_EPOLL */
375 #ifdef USE_DEVPOLL
376         int                     devpoll_fd;
377         int                     nevents;
378         struct pollfd           *events;
379 #endif  /* USE_DEVPOLL */
380 #ifdef USE_SELECT
381         int                     fd_bufsize;
382 #endif  /* USE_SELECT */
383         unsigned int            maxsocks;
384 #ifdef ISC_PLATFORM_USETHREADS
385         int                     pipe_fds[2];
386 #endif
387
388         /* Locked by fdlock. */
389         isc__socket_t          **fds;
390         int                     *fdstate;
391 #ifdef USE_DEVPOLL
392         pollinfo_t              *fdpollinfo;
393 #endif
394
395         /* Locked by manager lock. */
396         ISC_LIST(isc__socket_t) socklist;
397 #ifdef USE_SELECT
398         fd_set                  *read_fds;
399         fd_set                  *read_fds_copy;
400         fd_set                  *write_fds;
401         fd_set                  *write_fds_copy;
402         int                     maxfd;
403 #endif  /* USE_SELECT */
404         int                     reserved;       /* unlocked */
405 #ifdef USE_WATCHER_THREAD
406         isc_thread_t            watcher;
407         isc_condition_t         shutdown_ok;
408 #else /* USE_WATCHER_THREAD */
409         unsigned int            refs;
410 #endif /* USE_WATCHER_THREAD */
411         int                     maxudp;
412 };
413
414 #ifdef USE_SHARED_MANAGER
415 static isc__socketmgr_t *socketmgr = NULL;
416 #endif /* USE_SHARED_MANAGER */
417
418 #define CLOSED                  0       /* this one must be zero */
419 #define MANAGED                 1
420 #define CLOSE_PENDING           2
421
422 /*
423  * send() and recv() iovec counts
424  */
425 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
426 #ifdef ISC_NET_RECVOVERFLOW
427 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
428 #else
429 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
430 #endif
431
432 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
433                                   isc_sockettype_t type,
434                                   isc_socket_t **socketp,
435                                   isc_socket_t *dup_socket);
436 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
437 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
438 static void free_socket(isc__socket_t **);
439 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
440                                     isc__socket_t **);
441 static void destroy(isc__socket_t **);
442 static void internal_accept(isc_task_t *, isc_event_t *);
443 static void internal_connect(isc_task_t *, isc_event_t *);
444 static void internal_recv(isc_task_t *, isc_event_t *);
445 static void internal_send(isc_task_t *, isc_event_t *);
446 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
447 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
448 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
449 static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
450                               struct msghdr *, struct iovec *, size_t *);
451 static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
452                               struct msghdr *, struct iovec *, size_t *);
453 #ifdef USE_WATCHER_THREAD
454 static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
455 #endif
456
457 /*%
458  * The following can be either static or public, depending on build environment.
459  */
460
461 #ifdef BIND9
462 #define ISC_SOCKETFUNC_SCOPE
463 #else
464 #define ISC_SOCKETFUNC_SCOPE static
465 #endif
466
467 ISC_SOCKETFUNC_SCOPE isc_result_t
468 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
469                    isc_socket_t **socketp);
470 ISC_SOCKETFUNC_SCOPE void
471 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
472 ISC_SOCKETFUNC_SCOPE void
473 isc__socket_detach(isc_socket_t **socketp);
474 ISC_SOCKETFUNC_SCOPE isc_result_t
475 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
476 ISC_SOCKETFUNC_SCOPE isc_result_t
477 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
478                        unsigned int maxsocks);
479 ISC_SOCKETFUNC_SCOPE void
480 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
481 ISC_SOCKETFUNC_SCOPE isc_result_t
482 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
483                  unsigned int minimum, isc_task_t *task,
484                   isc_taskaction_t action, const void *arg);
485 ISC_SOCKETFUNC_SCOPE isc_result_t
486 isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
487                  unsigned int minimum, isc_task_t *task,
488                  isc_taskaction_t action, const void *arg);
489 ISC_SOCKETFUNC_SCOPE isc_result_t
490 isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
491                   unsigned int minimum, isc_task_t *task,
492                   isc_socketevent_t *event, unsigned int flags);
493 ISC_SOCKETFUNC_SCOPE isc_result_t
494 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
495                  isc_task_t *task, isc_taskaction_t action, const void *arg);
496 ISC_SOCKETFUNC_SCOPE isc_result_t
497 isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
498                    isc_task_t *task, isc_taskaction_t action, const void *arg,
499                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
500 ISC_SOCKETFUNC_SCOPE isc_result_t
501 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
502                   isc_task_t *task, isc_taskaction_t action, const void *arg);
503 ISC_SOCKETFUNC_SCOPE isc_result_t
504 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
505                     isc_task_t *task, isc_taskaction_t action, const void *arg,
506                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
507 ISC_SOCKETFUNC_SCOPE isc_result_t
508 isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
509                     isc_task_t *task,
510                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
511                     isc_socketevent_t *event, unsigned int flags);
512 ISC_SOCKETFUNC_SCOPE void
513 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
514 ISC_SOCKETFUNC_SCOPE isc_result_t
515 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
516                      isc_uint32_t owner, isc_uint32_t group);
517 ISC_SOCKETFUNC_SCOPE isc_result_t
518 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
519                  unsigned int options);
520 ISC_SOCKETFUNC_SCOPE isc_result_t
521 isc__socket_filter(isc_socket_t *sock, const char *filter);
522 ISC_SOCKETFUNC_SCOPE isc_result_t
523 isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
524 ISC_SOCKETFUNC_SCOPE isc_result_t
525 isc__socket_accept(isc_socket_t *sock,
526                    isc_task_t *task, isc_taskaction_t action, const void *arg);
527 ISC_SOCKETFUNC_SCOPE isc_result_t
528 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
529                     isc_task_t *task, isc_taskaction_t action,
530                     const void *arg);
531 ISC_SOCKETFUNC_SCOPE isc_result_t
532 isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
533 ISC_SOCKETFUNC_SCOPE isc_result_t
534 isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
535 ISC_SOCKETFUNC_SCOPE void
536 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
537 ISC_SOCKETFUNC_SCOPE isc_sockettype_t
538 isc__socket_gettype(isc_socket_t *sock);
539 ISC_SOCKETFUNC_SCOPE isc_boolean_t
540 isc__socket_isbound(isc_socket_t *sock);
541 ISC_SOCKETFUNC_SCOPE void
542 isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
543 #if defined(HAVE_LIBXML2) && defined(BIND9)
544 ISC_SOCKETFUNC_SCOPE void
545 isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
546 #endif
547
548 ISC_SOCKETFUNC_SCOPE isc_result_t
549 isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
550                           isc_sockfdwatch_t callback, void *cbarg,
551                           isc_task_t *task, isc_socket_t **socketp);
552 ISC_SOCKETFUNC_SCOPE isc_result_t
553 isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
554 ISC_SOCKETFUNC_SCOPE isc_result_t
555 isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
556 ISC_SOCKETFUNC_SCOPE int
557 isc__socket_getfd(isc_socket_t *sock);
558
559 static struct {
560         isc_socketmethods_t methods;
561
562         /*%
563          * The following are defined just for avoiding unused static functions.
564          */
565 #ifndef BIND9
566         void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
567                 *listen, *accept, *getpeername, *isbound;
568 #endif
569 } socketmethods = {
570         {
571                 isc__socket_attach,
572                 isc__socket_detach,
573                 isc__socket_bind,
574                 isc__socket_sendto,
575                 isc__socket_connect,
576                 isc__socket_recv,
577                 isc__socket_cancel,
578                 isc__socket_getsockname,
579                 isc__socket_gettype,
580                 isc__socket_ipv6only,
581                 isc__socket_fdwatchpoke,
582                 isc__socket_dup,
583                 isc__socket_getfd
584         }
585 #ifndef BIND9
586         ,
587         (void *)isc__socket_recvv, (void *)isc__socket_send,
588         (void *)isc__socket_sendv, (void *)isc__socket_sendto2,
589         (void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
590         (void *)isc__socket_filter, (void *)isc__socket_listen,
591         (void *)isc__socket_accept, (void *)isc__socket_getpeername,
592         (void *)isc__socket_isbound
593 #endif
594 };
595
596 static isc_socketmgrmethods_t socketmgrmethods = {
597         isc__socketmgr_destroy,
598         isc__socket_create,
599         isc__socket_fdwatchcreate
600 };
601
602 #define SELECT_POKE_SHUTDOWN            (-1)
603 #define SELECT_POKE_NOTHING             (-2)
604 #define SELECT_POKE_READ                (-3)
605 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
606 #define SELECT_POKE_WRITE               (-4)
607 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
608 #define SELECT_POKE_CLOSE               (-5)
609
610 #define SOCK_DEAD(s)                    ((s)->references == 0)
611
612 /*%
613  * Shortcut index arrays to get access to statistics counters.
614  */
enum {
        STATID_OPEN,            /* 0 */
        STATID_OPENFAIL,        /* 1 */
        STATID_CLOSE,           /* 2 */
        STATID_BINDFAIL,        /* 3 */
        STATID_CONNECTFAIL,     /* 4 */
        STATID_CONNECT,         /* 5 */
        STATID_ACCEPTFAIL,      /* 6 */
        STATID_ACCEPT,          /* 7 */
        STATID_SENDFAIL,        /* 8 */
        STATID_RECVFAIL         /* 9 */
};
627 static const isc_statscounter_t upd4statsindex[] = {
628         isc_sockstatscounter_udp4open,
629         isc_sockstatscounter_udp4openfail,
630         isc_sockstatscounter_udp4close,
631         isc_sockstatscounter_udp4bindfail,
632         isc_sockstatscounter_udp4connectfail,
633         isc_sockstatscounter_udp4connect,
634         -1,
635         -1,
636         isc_sockstatscounter_udp4sendfail,
637         isc_sockstatscounter_udp4recvfail
638 };
639 static const isc_statscounter_t upd6statsindex[] = {
640         isc_sockstatscounter_udp6open,
641         isc_sockstatscounter_udp6openfail,
642         isc_sockstatscounter_udp6close,
643         isc_sockstatscounter_udp6bindfail,
644         isc_sockstatscounter_udp6connectfail,
645         isc_sockstatscounter_udp6connect,
646         -1,
647         -1,
648         isc_sockstatscounter_udp6sendfail,
649         isc_sockstatscounter_udp6recvfail
650 };
651 static const isc_statscounter_t tcp4statsindex[] = {
652         isc_sockstatscounter_tcp4open,
653         isc_sockstatscounter_tcp4openfail,
654         isc_sockstatscounter_tcp4close,
655         isc_sockstatscounter_tcp4bindfail,
656         isc_sockstatscounter_tcp4connectfail,
657         isc_sockstatscounter_tcp4connect,
658         isc_sockstatscounter_tcp4acceptfail,
659         isc_sockstatscounter_tcp4accept,
660         isc_sockstatscounter_tcp4sendfail,
661         isc_sockstatscounter_tcp4recvfail
662 };
663 static const isc_statscounter_t tcp6statsindex[] = {
664         isc_sockstatscounter_tcp6open,
665         isc_sockstatscounter_tcp6openfail,
666         isc_sockstatscounter_tcp6close,
667         isc_sockstatscounter_tcp6bindfail,
668         isc_sockstatscounter_tcp6connectfail,
669         isc_sockstatscounter_tcp6connect,
670         isc_sockstatscounter_tcp6acceptfail,
671         isc_sockstatscounter_tcp6accept,
672         isc_sockstatscounter_tcp6sendfail,
673         isc_sockstatscounter_tcp6recvfail
674 };
675 static const isc_statscounter_t unixstatsindex[] = {
676         isc_sockstatscounter_unixopen,
677         isc_sockstatscounter_unixopenfail,
678         isc_sockstatscounter_unixclose,
679         isc_sockstatscounter_unixbindfail,
680         isc_sockstatscounter_unixconnectfail,
681         isc_sockstatscounter_unixconnect,
682         isc_sockstatscounter_unixacceptfail,
683         isc_sockstatscounter_unixaccept,
684         isc_sockstatscounter_unixsendfail,
685         isc_sockstatscounter_unixrecvfail
686 };
687 static const isc_statscounter_t fdwatchstatsindex[] = {
688         -1,
689         -1,
690         isc_sockstatscounter_fdwatchclose,
691         isc_sockstatscounter_fdwatchbindfail,
692         isc_sockstatscounter_fdwatchconnectfail,
693         isc_sockstatscounter_fdwatchconnect,
694         -1,
695         -1,
696         isc_sockstatscounter_fdwatchsendfail,
697         isc_sockstatscounter_fdwatchrecvfail
698 };
699
700 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
701     defined(USE_WATCHER_THREAD)
702 static void
703 manager_log(isc__socketmgr_t *sockmgr,
704             isc_logcategory_t *category, isc_logmodule_t *module, int level,
705             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
706 static void
707 manager_log(isc__socketmgr_t *sockmgr,
708             isc_logcategory_t *category, isc_logmodule_t *module, int level,
709             const char *fmt, ...)
710 {
711         char msgbuf[2048];
712         va_list ap;
713
714         if (! isc_log_wouldlog(isc_lctx, level))
715                 return;
716
717         va_start(ap, fmt);
718         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
719         va_end(ap);
720
721         isc_log_write(isc_lctx, category, module, level,
722                       "sockmgr %p: %s", sockmgr, msgbuf);
723 }
724 #endif
725
726 static void
727 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
728            isc_logcategory_t *category, isc_logmodule_t *module, int level,
729            isc_msgcat_t *msgcat, int msgset, int message,
730            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
731 static void
732 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
733            isc_logcategory_t *category, isc_logmodule_t *module, int level,
734            isc_msgcat_t *msgcat, int msgset, int message,
735            const char *fmt, ...)
736 {
737         char msgbuf[2048];
738         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
739         va_list ap;
740
741         if (! isc_log_wouldlog(isc_lctx, level))
742                 return;
743
744         va_start(ap, fmt);
745         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
746         va_end(ap);
747
748         if (address == NULL) {
749                 isc_log_iwrite(isc_lctx, category, module, level,
750                                msgcat, msgset, message,
751                                "socket %p: %s", sock, msgbuf);
752         } else {
753                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
754                 isc_log_iwrite(isc_lctx, category, module, level,
755                                msgcat, msgset, message,
756                                "socket %p %s: %s", sock, peerbuf, msgbuf);
757         }
758 }
759
760 #if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
761     defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
762 /*
763  * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
764  * setting IPV6_V6ONLY.
765  */
766 static void
767 FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
768 {
769         char strbuf[ISC_STRERRORSIZE];
770         int on = 1;
771
772         if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
773                 return;
774
775         if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
776                        (void *)&on, sizeof(on)) < 0) {
777
778                 isc__strerror(errno, strbuf, sizeof(strbuf));
779                 UNEXPECTED_ERROR(__FILE__, __LINE__,
780                                  "setsockopt(%d, IPV6_RECVPKTINFO) "
781                                  "%s: %s", sock->fd,
782                                  isc_msgcat_get(isc_msgcat,
783                                                 ISC_MSGSET_GENERAL,
784                                                 ISC_MSG_FAILED,
785                                                 "failed"),
786                                  strbuf);
787         }
788 }
789 #else
790 #define FIX_IPV6_RECVPKTINFO(sock) (void)0
791 #endif
792
793 /*%
794  * Increment socket-related statistics counters.
795  */
796 static inline void
797 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
798         REQUIRE(counterid != -1);
799
800         if (stats != NULL)
801                 isc_stats_increment(stats, counterid);
802 }
803
804 static inline isc_result_t
805 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
806         isc_result_t result = ISC_R_SUCCESS;
807
808 #ifdef USE_KQUEUE
809         struct kevent evchange;
810
811         memset(&evchange, 0, sizeof(evchange));
812         if (msg == SELECT_POKE_READ)
813                 evchange.filter = EVFILT_READ;
814         else
815                 evchange.filter = EVFILT_WRITE;
816         evchange.flags = EV_ADD;
817         evchange.ident = fd;
818         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
819                 result = isc__errno2result(errno);
820
821         return (result);
822 #elif defined(USE_EPOLL)
823         struct epoll_event event;
824
825         if (msg == SELECT_POKE_READ)
826                 event.events = EPOLLIN;
827         else
828                 event.events = EPOLLOUT;
829         memset(&event.data, 0, sizeof(event.data));
830         event.data.fd = fd;
831         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
832             errno != EEXIST) {
833                 result = isc__errno2result(errno);
834         }
835
836         return (result);
837 #elif defined(USE_DEVPOLL)
838         struct pollfd pfd;
839         int lockid = FDLOCK_ID(fd);
840
841         memset(&pfd, 0, sizeof(pfd));
842         if (msg == SELECT_POKE_READ)
843                 pfd.events = POLLIN;
844         else
845                 pfd.events = POLLOUT;
846         pfd.fd = fd;
847         pfd.revents = 0;
848         LOCK(&manager->fdlock[lockid]);
849         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
850                 result = isc__errno2result(errno);
851         else {
852                 if (msg == SELECT_POKE_READ)
853                         manager->fdpollinfo[fd].want_read = 1;
854                 else
855                         manager->fdpollinfo[fd].want_write = 1;
856         }
857         UNLOCK(&manager->fdlock[lockid]);
858
859         return (result);
860 #elif defined(USE_SELECT)
861         LOCK(&manager->lock);
862         if (msg == SELECT_POKE_READ)
863                 FD_SET(fd, manager->read_fds);
864         if (msg == SELECT_POKE_WRITE)
865                 FD_SET(fd, manager->write_fds);
866         UNLOCK(&manager->lock);
867
868         return (result);
869 #endif
870 }
871
872 static inline isc_result_t
873 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
874         isc_result_t result = ISC_R_SUCCESS;
875
876 #ifdef USE_KQUEUE
877         struct kevent evchange;
878
879         memset(&evchange, 0, sizeof(evchange));
880         if (msg == SELECT_POKE_READ)
881                 evchange.filter = EVFILT_READ;
882         else
883                 evchange.filter = EVFILT_WRITE;
884         evchange.flags = EV_DELETE;
885         evchange.ident = fd;
886         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
887                 result = isc__errno2result(errno);
888
889         return (result);
890 #elif defined(USE_EPOLL)
891         struct epoll_event event;
892
893         if (msg == SELECT_POKE_READ)
894                 event.events = EPOLLIN;
895         else
896                 event.events = EPOLLOUT;
897         memset(&event.data, 0, sizeof(event.data));
898         event.data.fd = fd;
899         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
900             errno != ENOENT) {
901                 char strbuf[ISC_STRERRORSIZE];
902                 isc__strerror(errno, strbuf, sizeof(strbuf));
903                 UNEXPECTED_ERROR(__FILE__, __LINE__,
904                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
905                 result = ISC_R_UNEXPECTED;
906         }
907         return (result);
908 #elif defined(USE_DEVPOLL)
909         struct pollfd pfds[2];
910         size_t writelen = sizeof(pfds[0]);
911         int lockid = FDLOCK_ID(fd);
912
913         memset(pfds, 0, sizeof(pfds));
914         pfds[0].events = POLLREMOVE;
915         pfds[0].fd = fd;
916
917         /*
918          * Canceling read or write polling via /dev/poll is tricky.  Since it
919          * only provides a way of canceling per FD, we may need to re-poll the
920          * socket for the other operation.
921          */
922         LOCK(&manager->fdlock[lockid]);
923         if (msg == SELECT_POKE_READ &&
924             manager->fdpollinfo[fd].want_write == 1) {
925                 pfds[1].events = POLLOUT;
926                 pfds[1].fd = fd;
927                 writelen += sizeof(pfds[1]);
928         }
929         if (msg == SELECT_POKE_WRITE &&
930             manager->fdpollinfo[fd].want_read == 1) {
931                 pfds[1].events = POLLIN;
932                 pfds[1].fd = fd;
933                 writelen += sizeof(pfds[1]);
934         }
935
936         if (write(manager->devpoll_fd, pfds, writelen) == -1)
937                 result = isc__errno2result(errno);
938         else {
939                 if (msg == SELECT_POKE_READ)
940                         manager->fdpollinfo[fd].want_read = 0;
941                 else
942                         manager->fdpollinfo[fd].want_write = 0;
943         }
944         UNLOCK(&manager->fdlock[lockid]);
945
946         return (result);
947 #elif defined(USE_SELECT)
948         LOCK(&manager->lock);
949         if (msg == SELECT_POKE_READ)
950                 FD_CLR(fd, manager->read_fds);
951         else if (msg == SELECT_POKE_WRITE)
952                 FD_CLR(fd, manager->write_fds);
953         UNLOCK(&manager->lock);
954
955         return (result);
956 #endif
957 }
958
959 static void
960 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
961         isc_result_t result;
962         int lockid = FDLOCK_ID(fd);
963
964         /*
965          * This is a wakeup on a socket.  If the socket is not in the
966          * process of being closed, start watching it for either reads
967          * or writes.
968          */
969
970         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
971
972         if (msg == SELECT_POKE_CLOSE) {
973                 /* No one should be updating fdstate, so no need to lock it */
974                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
975                 manager->fdstate[fd] = CLOSED;
976                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
977                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
978                 (void)close(fd);
979                 return;
980         }
981
982         LOCK(&manager->fdlock[lockid]);
983         if (manager->fdstate[fd] == CLOSE_PENDING) {
984                 UNLOCK(&manager->fdlock[lockid]);
985
986                 /*
987                  * We accept (and ignore) any error from unwatch_fd() as we are
988                  * closing the socket, hoping it doesn't leave dangling state in
989                  * the kernel.
990                  * Note that unwatch_fd() must be called after releasing the
991                  * fdlock; otherwise it could cause deadlock due to a lock order
992                  * reversal.
993                  */
994                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
995                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
996                 return;
997         }
998         if (manager->fdstate[fd] != MANAGED) {
999                 UNLOCK(&manager->fdlock[lockid]);
1000                 return;
1001         }
1002         UNLOCK(&manager->fdlock[lockid]);
1003
1004         /*
1005          * Set requested bit.
1006          */
1007         result = watch_fd(manager, fd, msg);
1008         if (result != ISC_R_SUCCESS) {
1009                 /*
1010                  * XXXJT: what should we do?  Ignoring the failure of watching
1011                  * a socket will make the application dysfunctional, but there
1012                  * seems to be no reasonable recovery process.
1013                  */
1014                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1015                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1016                               "failed to start watching FD (%d): %s",
1017                               fd, isc_result_totext(result));
1018         }
1019 }
1020
1021 #ifdef USE_WATCHER_THREAD
1022 /*
1023  * Poke the select loop when there is something for us to do.
1024  * The write is required (by POSIX) to complete.  That is, we
1025  * will not get partial writes.
1026  */
1027 static void
1028 select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
1029         int cc;
1030         int buf[2];
1031         char strbuf[ISC_STRERRORSIZE];
1032
1033         buf[0] = fd;
1034         buf[1] = msg;
1035
1036         do {
1037                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1038 #ifdef ENOSR
1039                 /*
1040                  * Treat ENOSR as EAGAIN but loop slowly as it is
1041                  * unlikely to clear fast.
1042                  */
1043                 if (cc < 0 && errno == ENOSR) {
1044                         sleep(1);
1045                         errno = EAGAIN;
1046                 }
1047 #endif
1048         } while (cc < 0 && SOFT_ERROR(errno));
1049
1050         if (cc < 0) {
1051                 isc__strerror(errno, strbuf, sizeof(strbuf));
1052                 FATAL_ERROR(__FILE__, __LINE__,
1053                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1054                                            ISC_MSG_WRITEFAILED,
1055                                            "write() failed "
1056                                            "during watcher poke: %s"),
1057                             strbuf);
1058         }
1059
1060         INSIST(cc == sizeof(buf));
1061 }
1062
1063 /*
1064  * Read a message on the internal fd.
1065  */
1066 static void
1067 select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1068         int buf[2];
1069         int cc;
1070         char strbuf[ISC_STRERRORSIZE];
1071
1072         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
1073         if (cc < 0) {
1074                 *msg = SELECT_POKE_NOTHING;
1075                 *fd = -1;       /* Silence compiler. */
1076                 if (SOFT_ERROR(errno))
1077                         return;
1078
1079                 isc__strerror(errno, strbuf, sizeof(strbuf));
1080                 FATAL_ERROR(__FILE__, __LINE__,
1081                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1082                                            ISC_MSG_READFAILED,
1083                                            "read() failed "
1084                                            "during watcher poke: %s"),
1085                             strbuf);
1086
1087                 return;
1088         }
1089         INSIST(cc == sizeof(buf));
1090
1091         *fd = buf[0];
1092         *msg = buf[1];
1093 }
1094 #else /* USE_WATCHER_THREAD */
1095 /*
1096  * Update the state of the socketmgr when something changes.
1097  */
1098 static void
1099 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
1100         if (msg == SELECT_POKE_SHUTDOWN)
1101                 return;
1102         else if (fd >= 0)
1103                 wakeup_socket(manager, fd, msg);
1104         return;
1105 }
1106 #endif /* USE_WATCHER_THREAD */
1107
1108 /*
1109  * Make a fd non-blocking.
1110  */
1111 static isc_result_t
1112 make_nonblock(int fd) {
1113         int ret;
1114         int flags;
1115         char strbuf[ISC_STRERRORSIZE];
1116 #ifdef USE_FIONBIO_IOCTL
1117         int on = 1;
1118
1119         ret = ioctl(fd, FIONBIO, (char *)&on);
1120 #else
1121         flags = fcntl(fd, F_GETFL, 0);
1122         flags |= PORT_NONBLOCK;
1123         ret = fcntl(fd, F_SETFL, flags);
1124 #endif
1125
1126         if (ret == -1) {
1127                 isc__strerror(errno, strbuf, sizeof(strbuf));
1128                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1129 #ifdef USE_FIONBIO_IOCTL
1130                                  "ioctl(%d, FIONBIO, &on): %s", fd,
1131 #else
1132                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1133 #endif
1134                                  strbuf);
1135
1136                 return (ISC_R_UNEXPECTED);
1137         }
1138
1139         return (ISC_R_SUCCESS);
1140 }
1141
1142 #ifdef USE_CMSG
1143 /*
1144  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1145  * In order to ensure as much portability as possible, we provide wrapper
1146  * functions of these macros.
1147  * Note that cmsg_space() could run slow on OSes that do not have
1148  * CMSG_SPACE.
1149  */
1150 static inline ISC_SOCKADDR_LEN_T
1151 cmsg_len(ISC_SOCKADDR_LEN_T len) {
1152 #ifdef CMSG_LEN
1153         return (CMSG_LEN(len));
1154 #else
1155         ISC_SOCKADDR_LEN_T hdrlen;
1156
1157         /*
1158          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
1159          * is correct.
1160          */
1161         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
1162         return (hdrlen + len);
1163 #endif
1164 }
1165
1166 static inline ISC_SOCKADDR_LEN_T
1167 cmsg_space(ISC_SOCKADDR_LEN_T len) {
1168 #ifdef CMSG_SPACE
1169         return (CMSG_SPACE(len));
1170 #else
1171         struct msghdr msg;
1172         struct cmsghdr *cmsgp;
1173         /*
1174          * XXX: The buffer length is an ad-hoc value, but should be enough
1175          * in a practical sense.
1176          */
1177         char dummybuf[sizeof(struct cmsghdr) + 1024];
1178
1179         memset(&msg, 0, sizeof(msg));
1180         msg.msg_control = dummybuf;
1181         msg.msg_controllen = sizeof(dummybuf);
1182
1183         cmsgp = (struct cmsghdr *)dummybuf;
1184         cmsgp->cmsg_len = cmsg_len(len);
1185
1186         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1187         if (cmsgp != NULL)
1188                 return ((char *)cmsgp - (char *)msg.msg_control);
1189         else
1190                 return (0);
1191 #endif
1192 }
1193 #endif /* USE_CMSG */
1194
1195 /*
1196  * Process control messages received on a socket.
1197  */
1198 static void
1199 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1200 #ifdef USE_CMSG
1201         struct cmsghdr *cmsgp;
1202 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1203         struct in6_pktinfo *pktinfop;
1204 #endif
1205 #ifdef SO_TIMESTAMP
1206         struct timeval *timevalp;
1207 #endif
1208 #endif
1209
1210         /*
1211          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1212          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1213          * They are all here, outside of the CPP tests, because it is
1214          * more consistent with the usual ISC coding style.
1215          */
1216         UNUSED(sock);
1217         UNUSED(msg);
1218         UNUSED(dev);
1219
1220 #ifdef ISC_NET_BSD44MSGHDR
1221
1222 #ifdef MSG_TRUNC
1223         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1224                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1225 #endif
1226
1227 #ifdef MSG_CTRUNC
1228         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1229                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1230 #endif
1231
1232 #ifndef USE_CMSG
1233         return;
1234 #else
1235         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1236                 return;
1237
1238 #ifdef SO_TIMESTAMP
1239         timevalp = NULL;
1240 #endif
1241 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1242         pktinfop = NULL;
1243 #endif
1244
1245         cmsgp = CMSG_FIRSTHDR(msg);
1246         while (cmsgp != NULL) {
1247                 socket_log(sock, NULL, TRACE,
1248                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1249                            "processing cmsg %p", cmsgp);
1250
1251 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1252                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1253                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1254
1255                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1256                         memcpy(&dev->pktinfo, pktinfop,
1257                                sizeof(struct in6_pktinfo));
1258                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1259                         socket_log(sock, NULL, TRACE,
1260                                    isc_msgcat, ISC_MSGSET_SOCKET,
1261                                    ISC_MSG_IFRECEIVED,
1262                                    "interface received on ifindex %u",
1263                                    dev->pktinfo.ipi6_ifindex);
1264                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1265                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1266                         goto next;
1267                 }
1268 #endif
1269
1270 #ifdef SO_TIMESTAMP
1271                 if (cmsgp->cmsg_level == SOL_SOCKET
1272                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1273                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1274                         dev->timestamp.seconds = timevalp->tv_sec;
1275                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1276                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1277                         goto next;
1278                 }
1279 #endif
1280
1281         next:
1282                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1283         }
1284 #endif /* USE_CMSG */
1285
1286 #endif /* ISC_NET_BSD44MSGHDR */
1287 }
1288
1289 /*
1290  * Construct an iov array and attach it to the msghdr passed in.  This is
1291  * the SEND constructor, which will use the used region of the buffer
1292  * (if using a buffer list) or will use the internal region (if a single
1293  * buffer I/O is requested).
1294  *
1295  * Nothing can be NULL, and the done event must list at least one buffer
1296  * on the buffer linked list for this function to be meaningful.
1297  *
1298  * If write_countp != NULL, *write_countp will hold the number of bytes
1299  * this transaction can send.
1300  */
1301 static void
1302 build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
1303                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1304 {
1305         unsigned int iovcount;
1306         isc_buffer_t *buffer;
1307         isc_region_t used;
1308         size_t write_count;
1309         size_t skip_count;
1310
1311         memset(msg, 0, sizeof(*msg));
1312
1313         if (!sock->connected) {
1314                 msg->msg_name = (void *)&dev->address.type.sa;
1315                 msg->msg_namelen = dev->address.length;
1316         } else {
1317                 msg->msg_name = NULL;
1318                 msg->msg_namelen = 0;
1319         }
1320
1321         buffer = ISC_LIST_HEAD(dev->bufferlist);
1322         write_count = 0;
1323         iovcount = 0;
1324
1325         /*
1326          * Single buffer I/O?  Skip what we've done so far in this region.
1327          */
1328         if (buffer == NULL) {
1329                 write_count = dev->region.length - dev->n;
1330                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1331                 iov[0].iov_len = write_count;
1332                 iovcount = 1;
1333
1334                 goto config;
1335         }
1336
1337         /*
1338          * Multibuffer I/O.
1339          * Skip the data in the buffer list that we have already written.
1340          */
1341         skip_count = dev->n;
1342         while (buffer != NULL) {
1343                 REQUIRE(ISC_BUFFER_VALID(buffer));
1344                 if (skip_count < isc_buffer_usedlength(buffer))
1345                         break;
1346                 skip_count -= isc_buffer_usedlength(buffer);
1347                 buffer = ISC_LIST_NEXT(buffer, link);
1348         }
1349
1350         while (buffer != NULL) {
1351                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1352
1353                 isc_buffer_usedregion(buffer, &used);
1354
1355                 if (used.length > 0) {
1356                         iov[iovcount].iov_base = (void *)(used.base
1357                                                           + skip_count);
1358                         iov[iovcount].iov_len = used.length - skip_count;
1359                         write_count += (used.length - skip_count);
1360                         skip_count = 0;
1361                         iovcount++;
1362                 }
1363                 buffer = ISC_LIST_NEXT(buffer, link);
1364         }
1365
1366         INSIST(skip_count == 0U);
1367
1368  config:
1369         msg->msg_iov = iov;
1370         msg->msg_iovlen = iovcount;
1371
1372 #ifdef ISC_NET_BSD44MSGHDR
1373         msg->msg_control = NULL;
1374         msg->msg_controllen = 0;
1375         msg->msg_flags = 0;
1376 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1377         if ((sock->type == isc_sockettype_udp)
1378             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1379 #if defined(IPV6_USE_MIN_MTU)
1380                 int use_min_mtu = 1;    /* -1, 0, 1 */
1381 #endif
1382                 struct cmsghdr *cmsgp;
1383                 struct in6_pktinfo *pktinfop;
1384
1385                 socket_log(sock, NULL, TRACE,
1386                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1387                            "sendto pktinfo data, ifindex %u",
1388                            dev->pktinfo.ipi6_ifindex);
1389
1390                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1391                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1392                 msg->msg_control = (void *)sock->sendcmsgbuf;
1393
1394                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1395                 cmsgp->cmsg_level = IPPROTO_IPV6;
1396                 cmsgp->cmsg_type = IPV6_PKTINFO;
1397                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1398                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1399                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1400 #if defined(IPV6_USE_MIN_MTU)
1401                 /*
1402                  * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1403                  * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1404                  * is used.
1405                  */
1406                 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1407                                            msg->msg_controllen);
1408                 msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1409                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1410
1411                 cmsgp->cmsg_level = IPPROTO_IPV6;
1412                 cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1413                 cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1414                 memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1415 #endif
1416         }
1417 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1418 #else /* ISC_NET_BSD44MSGHDR */
1419         msg->msg_accrights = NULL;
1420         msg->msg_accrightslen = 0;
1421 #endif /* ISC_NET_BSD44MSGHDR */
1422
1423         if (write_countp != NULL)
1424                 *write_countp = write_count;
1425 }
1426
1427 /*
1428  * Construct an iov array and attach it to the msghdr passed in.  This is
1429  * the RECV constructor, which will use the available region of the buffer
1430  * (if using a buffer list) or will use the internal region (if a single
1431  * buffer I/O is requested).
1432  *
1433  * Nothing can be NULL, and the done event must list at least one buffer
1434  * on the buffer linked list for this function to be meaningful.
1435  *
1436  * If read_countp != NULL, *read_countp will hold the number of bytes
1437  * this transaction can receive.
1438  */
1439 static void
1440 build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
1441                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1442 {
1443         unsigned int iovcount;
1444         isc_buffer_t *buffer;
1445         isc_region_t available;
1446         size_t read_count;
1447
1448         memset(msg, 0, sizeof(struct msghdr));
1449
1450         if (sock->type == isc_sockettype_udp) {
1451                 memset(&dev->address, 0, sizeof(dev->address));
1452 #ifdef BROKEN_RECVMSG
1453                 if (sock->pf == AF_INET) {
1454                         msg->msg_name = (void *)&dev->address.type.sin;
1455                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1456                 } else if (sock->pf == AF_INET6) {
1457                         msg->msg_name = (void *)&dev->address.type.sin6;
1458                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1459 #ifdef ISC_PLATFORM_HAVESYSUNH
1460                 } else if (sock->pf == AF_UNIX) {
1461                         msg->msg_name = (void *)&dev->address.type.sunix;
1462                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1463 #endif
1464                 } else {
1465                         msg->msg_name = (void *)&dev->address.type.sa;
1466                         msg->msg_namelen = sizeof(dev->address.type);
1467                 }
1468 #else
1469                 msg->msg_name = (void *)&dev->address.type.sa;
1470                 msg->msg_namelen = sizeof(dev->address.type);
1471 #endif
1472 #ifdef ISC_NET_RECVOVERFLOW
1473                 /* If needed, steal one iovec for overflow detection. */
1474                 maxiov--;
1475 #endif
1476         } else { /* TCP */
1477                 msg->msg_name = NULL;
1478                 msg->msg_namelen = 0;
1479                 dev->address = sock->peer_address;
1480         }
1481
1482         buffer = ISC_LIST_HEAD(dev->bufferlist);
1483         read_count = 0;
1484
1485         /*
1486          * Single buffer I/O?  Skip what we've done so far in this region.
1487          */
1488         if (buffer == NULL) {
1489                 read_count = dev->region.length - dev->n;
1490                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1491                 iov[0].iov_len = read_count;
1492                 iovcount = 1;
1493
1494                 goto config;
1495         }
1496
1497         /*
1498          * Multibuffer I/O.
1499          * Skip empty buffers.
1500          */
1501         while (buffer != NULL) {
1502                 REQUIRE(ISC_BUFFER_VALID(buffer));
1503                 if (isc_buffer_availablelength(buffer) != 0)
1504                         break;
1505                 buffer = ISC_LIST_NEXT(buffer, link);
1506         }
1507
1508         iovcount = 0;
1509         while (buffer != NULL) {
1510                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1511
1512                 isc_buffer_availableregion(buffer, &available);
1513
1514                 if (available.length > 0) {
1515                         iov[iovcount].iov_base = (void *)(available.base);
1516                         iov[iovcount].iov_len = available.length;
1517                         read_count += available.length;
1518                         iovcount++;
1519                 }
1520                 buffer = ISC_LIST_NEXT(buffer, link);
1521         }
1522
1523  config:
1524
1525         /*
1526          * If needed, set up to receive that one extra byte.  Note that
1527          * we know there is at least one iov left, since we stole it
1528          * at the top of this function.
1529          */
1530 #ifdef ISC_NET_RECVOVERFLOW
1531         if (sock->type == isc_sockettype_udp) {
1532                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1533                 iov[iovcount].iov_len = 1;
1534                 iovcount++;
1535         }
1536 #endif
1537
1538         msg->msg_iov = iov;
1539         msg->msg_iovlen = iovcount;
1540
1541 #ifdef ISC_NET_BSD44MSGHDR
1542         msg->msg_control = NULL;
1543         msg->msg_controllen = 0;
1544         msg->msg_flags = 0;
1545 #if defined(USE_CMSG)
1546         if (sock->type == isc_sockettype_udp) {
1547                 msg->msg_control = sock->recvcmsgbuf;
1548                 msg->msg_controllen = sock->recvcmsgbuflen;
1549         }
1550 #endif /* USE_CMSG */
1551 #else /* ISC_NET_BSD44MSGHDR */
1552         msg->msg_accrights = NULL;
1553         msg->msg_accrightslen = 0;
1554 #endif /* ISC_NET_BSD44MSGHDR */
1555
1556         if (read_countp != NULL)
1557                 *read_countp = read_count;
1558 }
1559
1560 static void
1561 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
1562                 isc_socketevent_t *dev)
1563 {
1564         if (sock->type == isc_sockettype_udp) {
1565                 if (address != NULL)
1566                         dev->address = *address;
1567                 else
1568                         dev->address = sock->peer_address;
1569         } else if (sock->type == isc_sockettype_tcp) {
1570                 INSIST(address == NULL);
1571                 dev->address = sock->peer_address;
1572         }
1573 }
1574
1575 static void
1576 destroy_socketevent(isc_event_t *event) {
1577         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1578
1579         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1580
1581         (ev->destroy)(event);
1582 }
1583
1584 static isc_socketevent_t *
1585 allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
1586                      isc_taskaction_t action, const void *arg)
1587 {
1588         isc_socketevent_t *ev;
1589
1590         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1591                                                      sock, eventtype,
1592                                                      action, arg,
1593                                                      sizeof(*ev));
1594
1595         if (ev == NULL)
1596                 return (NULL);
1597
1598         ev->result = ISC_R_UNSET;
1599         ISC_LINK_INIT(ev, ev_link);
1600         ISC_LIST_INIT(ev->bufferlist);
1601         ev->region.base = NULL;
1602         ev->n = 0;
1603         ev->offset = 0;
1604         ev->attributes = 0;
1605         ev->destroy = ev->ev_destroy;
1606         ev->ev_destroy = destroy_socketevent;
1607
1608         return (ev);
1609 }
1610
1611 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' to stdout.
 *
 * The name pointer and length, every iovec entry and (when
 * ISC_NET_BSD44MSGHDR is defined) the control buffer pointer and length
 * are printed.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        /* %p requires a void * argument, hence the explicit casts. */
        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                /* 'i' is unsigned, so it is printed with %u. */
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
1630 #endif
1631
/*
 * Status codes returned by doio_recv() (and, presumably, by the other
 * doio helpers in this file -- confirm against their definitions).
 */
#define DOIO_SUCCESS            0       /* i/o ok, event sent */
#define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD               2       /* i/o error, event sent */
#define DOIO_EOF                3       /* EOF, no event sent */
1636
1637 static int
1638 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
1639         int cc;
1640         struct iovec iov[MAXSCATTERGATHER_RECV];
1641         size_t read_count;
1642         size_t actual_count;
1643         struct msghdr msghdr;
1644         isc_buffer_t *buffer;
1645         int recv_errno;
1646         char strbuf[ISC_STRERRORSIZE];
1647
1648         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1649
1650 #if defined(ISC_SOCKET_DEBUG)
1651         dump_msg(&msghdr);
1652 #endif
1653
1654         cc = recvmsg(sock->fd, &msghdr, 0);
1655         recv_errno = errno;
1656
1657 #if defined(ISC_SOCKET_DEBUG)
1658         dump_msg(&msghdr);
1659 #endif
1660
1661         if (cc < 0) {
1662                 if (SOFT_ERROR(recv_errno))
1663                         return (DOIO_SOFT);
1664
1665                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1666                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1667                         socket_log(sock, NULL, IOEVENT,
1668                                    isc_msgcat, ISC_MSGSET_SOCKET,
1669                                    ISC_MSG_DOIORECV,
1670                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1671                                    sock->fd, cc, recv_errno, strbuf);
1672                 }
1673
1674 #define SOFT_OR_HARD(_system, _isc) \
1675         if (recv_errno == _system) { \
1676                 if (sock->connected) { \
1677                         dev->result = _isc; \
1678                         inc_stats(sock->manager->stats, \
1679                                   sock->statsindex[STATID_RECVFAIL]); \
1680                         return (DOIO_HARD); \
1681                 } \
1682                 return (DOIO_SOFT); \
1683         }
1684 #define ALWAYS_HARD(_system, _isc) \
1685         if (recv_errno == _system) { \
1686                 dev->result = _isc; \
1687                 inc_stats(sock->manager->stats, \
1688                           sock->statsindex[STATID_RECVFAIL]); \
1689                 return (DOIO_HARD); \
1690         }
1691
1692                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1693                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1694                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1695                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1696                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1697                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1698                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1699                 /*
1700                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1701                  * errors.
1702                  */
1703 #ifdef EPROTO
1704                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1705 #endif
1706                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1707
1708 #undef SOFT_OR_HARD
1709 #undef ALWAYS_HARD
1710
1711                 dev->result = isc__errno2result(recv_errno);
1712                 inc_stats(sock->manager->stats,
1713                           sock->statsindex[STATID_RECVFAIL]);
1714                 return (DOIO_HARD);
1715         }
1716
1717         /*
1718          * On TCP and UNIX sockets, zero length reads indicate EOF,
1719          * while on UDP sockets, zero length reads are perfectly valid,
1720          * although strange.
1721          */
1722         switch (sock->type) {
1723         case isc_sockettype_tcp:
1724         case isc_sockettype_unix:
1725                 if (cc == 0)
1726                         return (DOIO_EOF);
1727                 break;
1728         case isc_sockettype_udp:
1729                 break;
1730         case isc_sockettype_fdwatch:
1731         default:
1732                 INSIST(0);
1733         }
1734
1735         if (sock->type == isc_sockettype_udp) {
1736                 dev->address.length = msghdr.msg_namelen;
1737                 if (isc_sockaddr_getport(&dev->address) == 0) {
1738                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1739                                 socket_log(sock, &dev->address, IOEVENT,
1740                                            isc_msgcat, ISC_MSGSET_SOCKET,
1741                                            ISC_MSG_ZEROPORT,
1742                                            "dropping source port zero packet");
1743                         }
1744                         return (DOIO_SOFT);
1745                 }
1746                 /*
1747                  * Simulate a firewall blocking UDP responses bigger than
1748                  * 512 bytes.
1749                  */
1750                 if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
1751                         return (DOIO_SOFT);
1752         }
1753
1754         socket_log(sock, &dev->address, IOEVENT,
1755                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1756                    "packet received correctly");
1757
1758         /*
1759          * Overflow bit detection.  If we received MORE bytes than we should,
1760          * this indicates an overflow situation.  Set the flag in the
1761          * dev entry and adjust how much we read by one.
1762          */
1763 #ifdef ISC_NET_RECVOVERFLOW
1764         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1765                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1766                 cc--;
1767         }
1768 #endif
1769
1770         /*
1771          * If there are control messages attached, run through them and pull
1772          * out the interesting bits.
1773          */
1774         if (sock->type == isc_sockettype_udp)
1775                 process_cmsg(sock, &msghdr, dev);
1776
1777         /*
1778          * update the buffers (if any) and the i/o count
1779          */
1780         dev->n += cc;
1781         actual_count = cc;
1782         buffer = ISC_LIST_HEAD(dev->bufferlist);
1783         while (buffer != NULL && actual_count > 0U) {
1784                 REQUIRE(ISC_BUFFER_VALID(buffer));
1785                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1786                         actual_count -= isc_buffer_availablelength(buffer);
1787                         isc_buffer_add(buffer,
1788                                        isc_buffer_availablelength(buffer));
1789                 } else {
1790                         isc_buffer_add(buffer, actual_count);
1791                         actual_count = 0;
1792                         POST(actual_count);
1793                         break;
1794                 }
1795                 buffer = ISC_LIST_NEXT(buffer, link);
1796                 if (buffer == NULL) {
1797                         INSIST(actual_count == 0U);
1798                 }
1799         }
1800
1801         /*
1802          * If we read less than we expected, update counters,
1803          * and let the upper layer poke the descriptor.
1804          */
1805         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1806                 return (DOIO_SOFT);
1807
1808         /*
1809          * Full reads are posted, or partials if partials are ok.
1810          */
1811         dev->result = ISC_R_SUCCESS;
1812         return (DOIO_SUCCESS);
1813 }
1814
1815 /*
1816  * Returns:
1817  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1818  *                      ISC_R_SUCCESS.
1819  *
1820  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1821  *                      dev->result contains the appropriate error.
1822  *
1823  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1824  *                      event was sent.  The operation should be retried.
1825  *
1826  *      No other return values are possible.
1827  */
1828 static int
1829 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1830         int cc;
1831         struct iovec iov[MAXSCATTERGATHER_SEND];
1832         size_t write_count;
1833         struct msghdr msghdr;
1834         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1835         int attempts = 0;
1836         int send_errno;
1837         char strbuf[ISC_STRERRORSIZE];
1838
1839         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1840
1841  resend:
1842         cc = sendmsg(sock->fd, &msghdr, 0);
1843         send_errno = errno;
1844
1845         /*
1846          * Check for error or block condition.
1847          */
1848         if (cc < 0) {
1849                 if (send_errno == EINTR && ++attempts < NRETRIES)
1850                         goto resend;
1851
1852                 if (SOFT_ERROR(send_errno))
1853                         return (DOIO_SOFT);
1854
1855 #define SOFT_OR_HARD(_system, _isc) \
1856         if (send_errno == _system) { \
1857                 if (sock->connected) { \
1858                         dev->result = _isc; \
1859                         inc_stats(sock->manager->stats, \
1860                                   sock->statsindex[STATID_SENDFAIL]); \
1861                         return (DOIO_HARD); \
1862                 } \
1863                 return (DOIO_SOFT); \
1864         }
1865 #define ALWAYS_HARD(_system, _isc) \
1866         if (send_errno == _system) { \
1867                 dev->result = _isc; \
1868                 inc_stats(sock->manager->stats, \
1869                           sock->statsindex[STATID_SENDFAIL]); \
1870                 return (DOIO_HARD); \
1871         }
1872
1873                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1874                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1875                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1876                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1877                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1878 #ifdef EHOSTDOWN
1879                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1880 #endif
1881                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1882                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1883                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1884                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1885                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1886
1887 #undef SOFT_OR_HARD
1888 #undef ALWAYS_HARD
1889
1890                 /*
1891                  * The other error types depend on whether or not the
1892                  * socket is UDP or TCP.  If it is UDP, some errors
1893                  * that we expect to be fatal under TCP are merely
1894                  * annoying, and are really soft errors.
1895                  *
1896                  * However, these soft errors are still returned as
1897                  * a status.
1898                  */
1899                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1900                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1901                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1902                                  addrbuf, strbuf);
1903                 dev->result = isc__errno2result(send_errno);
1904                 inc_stats(sock->manager->stats,
1905                           sock->statsindex[STATID_SENDFAIL]);
1906                 return (DOIO_HARD);
1907         }
1908
1909         if (cc == 0) {
1910                 inc_stats(sock->manager->stats,
1911                           sock->statsindex[STATID_SENDFAIL]);
1912                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1913                                  "doio_send: send() %s 0",
1914                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1915                                                 ISC_MSG_RETURNED, "returned"));
1916         }
1917
1918         /*
1919          * If we write less than we expected, update counters, poke.
1920          */
1921         dev->n += cc;
1922         if ((size_t)cc != write_count)
1923                 return (DOIO_SOFT);
1924
1925         /*
1926          * Exactly what we wanted to write.  We're done with this
1927          * entry.  Post its completion event.
1928          */
1929         dev->result = ISC_R_SUCCESS;
1930         return (DOIO_SUCCESS);
1931 }
1932
1933 /*
1934  * Kill.
1935  *
1936  * Caller must ensure that the socket is not locked and no external
1937  * references exist.
1938  */
1939 static void
1940 closesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1941         isc_sockettype_t type = sock->type;
1942         int lockid = FDLOCK_ID(fd);
1943
1944         /*
1945          * No one has this socket open, so the watcher doesn't have to be
1946          * poked, and the socket doesn't have to be locked.
1947          */
1948         LOCK(&manager->fdlock[lockid]);
1949         manager->fds[fd] = NULL;
1950         if (type == isc_sockettype_fdwatch)
1951                 manager->fdstate[fd] = CLOSED;
1952         else
1953                 manager->fdstate[fd] = CLOSE_PENDING;
1954         UNLOCK(&manager->fdlock[lockid]);
1955         if (type == isc_sockettype_fdwatch) {
1956                 /*
1957                  * The caller may close the socket once this function returns,
1958                  * and `fd' may be reassigned for a new socket.  So we do
1959                  * unwatch_fd() here, rather than defer it via select_poke().
1960                  * Note: this may complicate data protection among threads and
1961                  * may reduce performance due to additional locks.  One way to
1962                  * solve this would be to dup() the watched descriptor, but we
1963                  * take a simpler approach at this moment.
1964                  */
1965                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1966                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1967         } else
1968                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1969
1970         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1971
1972         /*
1973          * update manager->maxfd here (XXX: this should be implemented more
1974          * efficiently)
1975          */
1976 #ifdef USE_SELECT
1977         LOCK(&manager->lock);
1978         if (manager->maxfd == fd) {
1979                 int i;
1980
1981                 manager->maxfd = 0;
1982                 for (i = fd - 1; i >= 0; i--) {
1983                         lockid = FDLOCK_ID(i);
1984
1985                         LOCK(&manager->fdlock[lockid]);
1986                         if (manager->fdstate[i] == MANAGED) {
1987                                 manager->maxfd = i;
1988                                 UNLOCK(&manager->fdlock[lockid]);
1989                                 break;
1990                         }
1991                         UNLOCK(&manager->fdlock[lockid]);
1992                 }
1993 #ifdef ISC_PLATFORM_USETHREADS
1994                 if (manager->maxfd < manager->pipe_fds[0])
1995                         manager->maxfd = manager->pipe_fds[0];
1996 #endif
1997         }
1998         UNLOCK(&manager->lock);
1999 #endif  /* USE_SELECT */
2000 }
2001
2002 static void
2003 destroy(isc__socket_t **sockp) {
2004         int fd;
2005         isc__socket_t *sock = *sockp;
2006         isc__socketmgr_t *manager = sock->manager;
2007
2008         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2009                    ISC_MSG_DESTROYING, "destroying");
2010
2011         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2012         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2013         INSIST(ISC_LIST_EMPTY(sock->send_list));
2014         INSIST(sock->connect_ev == NULL);
2015         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
2016
2017         if (sock->fd >= 0) {
2018                 fd = sock->fd;
2019                 sock->fd = -1;
2020                 closesocket(manager, sock, fd);
2021         }
2022
2023         LOCK(&manager->lock);
2024
2025         ISC_LIST_UNLINK(manager->socklist, sock, link);
2026
2027 #ifdef USE_WATCHER_THREAD
2028         if (ISC_LIST_EMPTY(manager->socklist))
2029                 SIGNAL(&manager->shutdown_ok);
2030 #endif /* USE_WATCHER_THREAD */
2031
2032         /* can't unlock manager as its memory context is still used */
2033         free_socket(sockp);
2034
2035         UNLOCK(&manager->lock);
2036 }
2037
2038 static isc_result_t
2039 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
2040                 isc__socket_t **socketp)
2041 {
2042         isc__socket_t *sock;
2043         isc_result_t result;
2044         ISC_SOCKADDR_LEN_T cmsgbuflen;
2045
2046         sock = isc_mem_get(manager->mctx, sizeof(*sock));
2047
2048         if (sock == NULL)
2049                 return (ISC_R_NOMEMORY);
2050
2051         sock->common.magic = 0;
2052         sock->common.impmagic = 0;
2053         sock->references = 0;
2054
2055         sock->manager = manager;
2056         sock->type = type;
2057         sock->fd = -1;
2058         sock->dupped = 0;
2059         sock->statsindex = NULL;
2060
2061         ISC_LINK_INIT(sock, link);
2062
2063         sock->recvcmsgbuf = NULL;
2064         sock->sendcmsgbuf = NULL;
2065
2066         /*
2067          * set up cmsg buffers
2068          */
2069         cmsgbuflen = 0;
2070 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2071         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2072 #endif
2073 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
2074         cmsgbuflen += cmsg_space(sizeof(struct timeval));
2075 #endif
2076         sock->recvcmsgbuflen = cmsgbuflen;
2077         if (sock->recvcmsgbuflen != 0U) {
2078                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2079                 if (sock->recvcmsgbuf == NULL) {
2080                         result = ISC_R_NOMEMORY;
2081                         goto error;
2082                 }
2083         }
2084
2085         cmsgbuflen = 0;
2086 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2087         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2088 #if defined(IPV6_USE_MIN_MTU)
2089         /*
2090          * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
2091          * support.
2092          */
2093         cmsgbuflen += cmsg_space(sizeof(int));
2094 #endif
2095 #endif
2096         sock->sendcmsgbuflen = cmsgbuflen;
2097         if (sock->sendcmsgbuflen != 0U) {
2098                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2099                 if (sock->sendcmsgbuf == NULL) {
2100                         result = ISC_R_NOMEMORY;
2101                         goto error;
2102                 }
2103         }
2104
2105         memset(sock->name, 0, sizeof(sock->name));
2106         sock->tag = NULL;
2107
2108         /*
2109          * set up list of readers and writers to be initially empty
2110          */
2111         ISC_LIST_INIT(sock->recv_list);
2112         ISC_LIST_INIT(sock->send_list);
2113         ISC_LIST_INIT(sock->accept_list);
2114         sock->connect_ev = NULL;
2115         sock->pending_recv = 0;
2116         sock->pending_send = 0;
2117         sock->pending_accept = 0;
2118         sock->listener = 0;
2119         sock->connected = 0;
2120         sock->connecting = 0;
2121         sock->bound = 0;
2122
2123         /*
2124          * initialize the lock
2125          */
2126         result = isc_mutex_init(&sock->lock);
2127         if (result != ISC_R_SUCCESS) {
2128                 sock->common.magic = 0;
2129                 sock->common.impmagic = 0;
2130                 goto error;
2131         }
2132
2133         /*
2134          * Initialize readable and writable events
2135          */
2136         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
2137                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
2138                        NULL, sock, sock, NULL, NULL);
2139         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
2140                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
2141                        NULL, sock, sock, NULL, NULL);
2142
2143         sock->common.magic = ISCAPI_SOCKET_MAGIC;
2144         sock->common.impmagic = SOCKET_MAGIC;
2145         *socketp = sock;
2146
2147         return (ISC_R_SUCCESS);
2148
2149  error:
2150         if (sock->recvcmsgbuf != NULL)
2151                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
2152                             sock->recvcmsgbuflen);
2153         if (sock->sendcmsgbuf != NULL)
2154                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
2155                             sock->sendcmsgbuflen);
2156         isc_mem_put(manager->mctx, sock, sizeof(*sock));
2157
2158         return (result);
2159 }
2160
2161 /*
2162  * This event requires that the various lists be empty, that the reference
2163  * count be 1, and that the magic number is valid.  The other socket bits,
2164  * like the lock, must be initialized as well.  The fd associated must be
2165  * marked as closed, by setting it to -1 on close, or this routine will
2166  * also close the socket.
2167  */
2168 static void
2169 free_socket(isc__socket_t **socketp) {
2170         isc__socket_t *sock = *socketp;
2171
2172         INSIST(sock->references == 0);
2173         INSIST(VALID_SOCKET(sock));
2174         INSIST(!sock->connecting);
2175         INSIST(!sock->pending_recv);
2176         INSIST(!sock->pending_send);
2177         INSIST(!sock->pending_accept);
2178         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2179         INSIST(ISC_LIST_EMPTY(sock->send_list));
2180         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2181         INSIST(!ISC_LINK_LINKED(sock, link));
2182
2183         if (sock->recvcmsgbuf != NULL)
2184                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
2185                             sock->recvcmsgbuflen);
2186         if (sock->sendcmsgbuf != NULL)
2187                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
2188                             sock->sendcmsgbuflen);
2189
2190         sock->common.magic = 0;
2191         sock->common.impmagic = 0;
2192
2193         DESTROYLOCK(&sock->lock);
2194
2195         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
2196
2197         *socketp = NULL;
2198 }
2199
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t	bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, parse "major.minor" from the uname() release string and
 * clear 'bsdcompat' when the version is 2.4 or later.  Does nothing
 * on other platforms, or when the release string has no '.' after
 * the major number.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname kernel;
	char *cursor;
	long int major;
	long int minor;

	uname(&kernel);	/* Can only fail if buf is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname(). */
	major = strtol(kernel.release, &cursor, 10);
	if (*cursor != '.')
		return;

	minor = strtol(cursor + 1, &cursor, 10);
	if (major > 2 || (major == 2 && minor >= 4))
		bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
2237
2238 static isc_result_t
2239 opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
2240            isc__socket_t *dup_socket)
2241 {
2242         isc_result_t result;
2243         char strbuf[ISC_STRERRORSIZE];
2244         const char *err = "socket";
2245         int tries = 0;
2246 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2247         int on = 1;
2248 #endif
2249 #if defined(SO_RCVBUF)
2250         ISC_SOCKADDR_LEN_T optlen;
2251         int size;
2252 #endif
2253
2254  again:
2255         if (dup_socket == NULL) {
2256                 switch (sock->type) {
2257                 case isc_sockettype_udp:
2258                         sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2259                         break;
2260                 case isc_sockettype_tcp:
2261                         sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2262                         break;
2263                 case isc_sockettype_unix:
2264                         sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2265                         break;
2266                 case isc_sockettype_fdwatch:
2267                         /*
2268                          * We should not be called for isc_sockettype_fdwatch
2269                          * sockets.
2270                          */
2271                         INSIST(0);
2272                         break;
2273                 }
2274         } else {
2275                 sock->fd = dup(dup_socket->fd);
2276                 sock->dupped = 1;
2277                 sock->bound = dup_socket->bound;
2278         }
2279         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2280                 goto again;
2281
2282 #ifdef F_DUPFD
2283         /*
2284          * Leave a space for stdio and TCP to work in.
2285          */
2286         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2287             sock->fd >= 0 && sock->fd < manager->reserved) {
2288                 int new, tmp;
2289                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2290                 tmp = errno;
2291                 (void)close(sock->fd);
2292                 errno = tmp;
2293                 sock->fd = new;
2294                 err = "isc_socket_create: fcntl/reserved";
2295         } else if (sock->fd >= 0 && sock->fd < 20) {
2296                 int new, tmp;
2297                 new = fcntl(sock->fd, F_DUPFD, 20);
2298                 tmp = errno;
2299                 (void)close(sock->fd);
2300                 errno = tmp;
2301                 sock->fd = new;
2302                 err = "isc_socket_create: fcntl";
2303         }
2304 #endif
2305
2306         if (sock->fd >= (int)manager->maxsocks) {
2307                 (void)close(sock->fd);
2308                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2309                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2310                                isc_msgcat, ISC_MSGSET_SOCKET,
2311                                ISC_MSG_TOOMANYFDS,
2312                                "socket: file descriptor exceeds limit (%d/%u)",
2313                                sock->fd, manager->maxsocks);
2314                 return (ISC_R_NORESOURCES);
2315         }
2316
2317         if (sock->fd < 0) {
2318                 switch (errno) {
2319                 case EMFILE:
2320                 case ENFILE:
2321                         isc__strerror(errno, strbuf, sizeof(strbuf));
2322                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2323                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2324                                        isc_msgcat, ISC_MSGSET_SOCKET,
2325                                        ISC_MSG_TOOMANYFDS,
2326                                        "%s: %s", err, strbuf);
2327                         /* fallthrough */
2328                 case ENOBUFS:
2329                         return (ISC_R_NORESOURCES);
2330
2331                 case EPROTONOSUPPORT:
2332                 case EPFNOSUPPORT:
2333                 case EAFNOSUPPORT:
2334                 /*
2335                  * Linux 2.2 (and maybe others) return EINVAL instead of
2336                  * EAFNOSUPPORT.
2337                  */
2338                 case EINVAL:
2339                         return (ISC_R_FAMILYNOSUPPORT);
2340
2341                 default:
2342                         isc__strerror(errno, strbuf, sizeof(strbuf));
2343                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2344                                          "%s() %s: %s", err,
2345                                          isc_msgcat_get(isc_msgcat,
2346                                                         ISC_MSGSET_GENERAL,
2347                                                         ISC_MSG_FAILED,
2348                                                         "failed"),
2349                                          strbuf);
2350                         return (ISC_R_UNEXPECTED);
2351                 }
2352         }
2353
2354         if (dup_socket != NULL)
2355                 goto setup_done;
2356
2357         result = make_nonblock(sock->fd);
2358         if (result != ISC_R_SUCCESS) {
2359                 (void)close(sock->fd);
2360                 return (result);
2361         }
2362
2363 #ifdef SO_BSDCOMPAT
2364         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2365                                   clear_bsdcompat) == ISC_R_SUCCESS);
2366         if (sock->type != isc_sockettype_unix && bsdcompat &&
2367             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2368                        (void *)&on, sizeof(on)) < 0) {
2369                 isc__strerror(errno, strbuf, sizeof(strbuf));
2370                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2371                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2372                                  sock->fd,
2373                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2374                                                 ISC_MSG_FAILED, "failed"),
2375                                  strbuf);
2376                 /* Press on... */
2377         }
2378 #endif
2379
2380 #ifdef SO_NOSIGPIPE
2381         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2382                        (void *)&on, sizeof(on)) < 0) {
2383                 isc__strerror(errno, strbuf, sizeof(strbuf));
2384                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2385                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2386                                  sock->fd,
2387                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2388                                                 ISC_MSG_FAILED, "failed"),
2389                                  strbuf);
2390                 /* Press on... */
2391         }
2392 #endif
2393
2394 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2395         if (sock->type == isc_sockettype_udp) {
2396
2397 #if defined(USE_CMSG)
2398 #if defined(SO_TIMESTAMP)
2399                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2400                                (void *)&on, sizeof(on)) < 0
2401                     && errno != ENOPROTOOPT) {
2402                         isc__strerror(errno, strbuf, sizeof(strbuf));
2403                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2404                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2405                                          sock->fd,
2406                                          isc_msgcat_get(isc_msgcat,
2407                                                         ISC_MSGSET_GENERAL,
2408                                                         ISC_MSG_FAILED,
2409                                                         "failed"),
2410                                          strbuf);
2411                         /* Press on... */
2412                 }
2413 #endif /* SO_TIMESTAMP */
2414
2415 #if defined(ISC_PLATFORM_HAVEIPV6)
2416                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2417                         /*
2418                          * Warn explicitly because this anomaly can be hidden
2419                          * in usual operation (and unexpectedly appear later).
2420                          */
2421                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2422                                          "No buffer available to receive "
2423                                          "IPv6 destination");
2424                 }
2425 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2426 #ifdef IPV6_RECVPKTINFO
2427                 /* RFC 3542 */
2428                 if ((sock->pf == AF_INET6)
2429                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2430                                    (void *)&on, sizeof(on)) < 0)) {
2431                         isc__strerror(errno, strbuf, sizeof(strbuf));
2432                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2433                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2434                                          "%s: %s", sock->fd,
2435                                          isc_msgcat_get(isc_msgcat,
2436                                                         ISC_MSGSET_GENERAL,
2437                                                         ISC_MSG_FAILED,
2438                                                         "failed"),
2439                                          strbuf);
2440                 }
2441 #else
2442                 /* RFC 2292 */
2443                 if ((sock->pf == AF_INET6)
2444                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2445                                    (void *)&on, sizeof(on)) < 0)) {
2446                         isc__strerror(errno, strbuf, sizeof(strbuf));
2447                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2448                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2449                                          sock->fd,
2450                                          isc_msgcat_get(isc_msgcat,
2451                                                         ISC_MSGSET_GENERAL,
2452                                                         ISC_MSG_FAILED,
2453                                                         "failed"),
2454                                          strbuf);
2455                 }
2456 #endif /* IPV6_RECVPKTINFO */
2457 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2458 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2459                 /* use minimum MTU */
2460                 if (sock->pf == AF_INET6 &&
2461                     setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2462                                (void *)&on, sizeof(on)) < 0) {
2463                         isc__strerror(errno, strbuf, sizeof(strbuf));
2464                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2465                                          "setsockopt(%d, IPV6_USE_MIN_MTU) "
2466                                          "%s: %s", sock->fd,
2467                                          isc_msgcat_get(isc_msgcat,
2468                                                         ISC_MSGSET_GENERAL,
2469                                                         ISC_MSG_FAILED,
2470                                                         "failed"),
2471                                          strbuf);
2472                 }
2473 #endif
2474 #if defined(IPV6_MTU)
2475                 /*
2476                  * Use minimum MTU on IPv6 sockets.
2477                  */
2478                 if (sock->pf == AF_INET6) {
2479                         int mtu = 1280;
2480                         (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
2481                                          &mtu, sizeof(mtu));
2482                 }
2483 #endif
2484 #if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2485                 /*
2486                  * Turn off Path MTU discovery on IPv6/UDP sockets.
2487                  */
2488                 if (sock->pf == AF_INET6) {
2489                         int action = IPV6_PMTUDISC_DONT;
2490                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2491                                          IPV6_MTU_DISCOVER, &action,
2492                                          sizeof(action));
2493                 }
2494 #endif
2495 #endif /* ISC_PLATFORM_HAVEIPV6 */
2496 #endif /* defined(USE_CMSG) */
2497
2498 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2499                 /*
2500                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2501                  */
2502                 if (sock->pf == AF_INET) {
2503                         int action = IP_PMTUDISC_DONT;
2504                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2505                                          &action, sizeof(action));
2506                 }
2507 #endif
2508 #if defined(IP_DONTFRAG)
2509                 /*
2510                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2511                  */
2512                 if (sock->pf == AF_INET) {
2513                         int off = 0;
2514                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2515                                          &off, sizeof(off));
2516                 }
2517 #endif
2518
2519 #if defined(SO_RCVBUF)
2520                 optlen = sizeof(size);
2521                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2522                                (void *)&size, &optlen) >= 0 &&
2523                      size < RCVBUFSIZE) {
2524                         size = RCVBUFSIZE;
2525                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2526                                        (void *)&size, sizeof(size)) == -1) {
2527                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2528                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2529                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2530                                         sock->fd, size,
2531                                         isc_msgcat_get(isc_msgcat,
2532                                                        ISC_MSGSET_GENERAL,
2533                                                        ISC_MSG_FAILED,
2534                                                        "failed"),
2535                                         strbuf);
2536                         }
2537                 }
2538 #endif
2539         }
2540 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2541
2542 setup_done:
2543         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2544
2545         return (ISC_R_SUCCESS);
2546 }
2547
2548 /*
2549  * Create a 'type' socket or duplicate an existing socket, managed
2550  * by 'manager'.  Events will be posted to 'task' and when dispatched
2551  * 'action' will be called with 'arg' as the arg value.  The new
2552  * socket is returned in 'socketp'.
2553  */
2554 static isc_result_t
2555 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2556               isc_socket_t **socketp, isc_socket_t *dup_socket)
2557 {
2558         isc__socket_t *sock = NULL;
2559         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2560         isc_result_t result;
2561         int lockid;
2562
2563         REQUIRE(VALID_MANAGER(manager));
2564         REQUIRE(socketp != NULL && *socketp == NULL);
2565         REQUIRE(type != isc_sockettype_fdwatch);
2566
2567         result = allocate_socket(manager, type, &sock);
2568         if (result != ISC_R_SUCCESS)
2569                 return (result);
2570
2571         switch (sock->type) {
2572         case isc_sockettype_udp:
2573                 sock->statsindex =
2574                         (pf == AF_INET) ? upd4statsindex : upd6statsindex;
2575                 break;
2576         case isc_sockettype_tcp:
2577                 sock->statsindex =
2578                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2579                 break;
2580         case isc_sockettype_unix:
2581                 sock->statsindex = unixstatsindex;
2582                 break;
2583         default:
2584                 INSIST(0);
2585         }
2586
2587         sock->pf = pf;
2588
2589         result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
2590         if (result != ISC_R_SUCCESS) {
2591                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2592                 free_socket(&sock);
2593                 return (result);
2594         }
2595
2596         sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2597         sock->references = 1;
2598         *socketp = (isc_socket_t *)sock;
2599
2600         /*
2601          * Note we don't have to lock the socket like we normally would because
2602          * there are no external references to it yet.
2603          */
2604
2605         lockid = FDLOCK_ID(sock->fd);
2606         LOCK(&manager->fdlock[lockid]);
2607         manager->fds[sock->fd] = sock;
2608         manager->fdstate[sock->fd] = MANAGED;
2609 #ifdef USE_DEVPOLL
2610         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2611                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2612 #endif
2613         UNLOCK(&manager->fdlock[lockid]);
2614
2615         LOCK(&manager->lock);
2616         ISC_LIST_APPEND(manager->socklist, sock, link);
2617 #ifdef USE_SELECT
2618         if (manager->maxfd < sock->fd)
2619                 manager->maxfd = sock->fd;
2620 #endif
2621         UNLOCK(&manager->lock);
2622
2623         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2624                    ISC_MSG_CREATED, dup_socket == NULL ? "dupped" : "created");
2625
2626         return (ISC_R_SUCCESS);
2627 }
2628
2629 /*%
2630  * Create a new 'type' socket managed by 'manager'.  Events
2631  * will be posted to 'task' and when dispatched 'action' will be
2632  * called with 'arg' as the arg value.  The new socket is returned
2633  * in 'socketp'.
2634  */
2635 ISC_SOCKETFUNC_SCOPE isc_result_t
2636 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2637                    isc_socket_t **socketp)
2638 {
2639         return (socket_create(manager0, pf, type, socketp, NULL));
2640 }
2641
2642 /*%
2643  * Duplicate an existing socket.  The new socket is returned
2644  * in 'socketp'.
2645  */
2646 ISC_SOCKETFUNC_SCOPE isc_result_t
2647 isc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
2648         isc__socket_t *sock = (isc__socket_t *)sock0;
2649
2650         REQUIRE(VALID_SOCKET(sock));
2651         REQUIRE(socketp != NULL && *socketp == NULL);
2652
2653         return (socket_create((isc_socketmgr_t *) sock->manager,
2654                               sock->pf, sock->type, socketp,
2655                               sock0));
2656 }
2657
#ifdef BIND9
/*
 * Open a descriptor for an existing socket object that currently has
 * none (sock->fd == -1) and register it with the socket's manager.
 * The socket must have exactly one reference and must not be an fdwatch
 * socket.  Returns the result of opensocket().
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_open(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	UNLOCK(&sock->lock);
	/*
	 * We don't need to retain the lock hereafter, since no one else has
	 * this socket.
	 */
	REQUIRE(sock->fd == -1);

	result = opensocket(sock->manager, sock, NULL);
	if (result != ISC_R_SUCCESS) {
		/* Leave the socket marked as closed on failure. */
		sock->fd = -1;
		return (result);
	}

	/* Record the new descriptor in the manager's per-fd tables. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&sock->manager->fdlock[lockid]);
	sock->manager->fds[sock->fd] = sock;
	sock->manager->fdstate[sock->fd] = MANAGED;
#ifdef USE_DEVPOLL
	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
#endif
	UNLOCK(&sock->manager->fdlock[lockid]);

#ifdef USE_SELECT
	/* Keep the manager's highest-descriptor mark up to date. */
	LOCK(&sock->manager->lock);
	if (sock->manager->maxfd < sock->fd)
		sock->manager->maxfd = sock->fd;
	UNLOCK(&sock->manager->lock);
#endif

	return (result);
}
#endif  /* BIND9 */
2703
2704 /*
2705  * Create a new 'type' socket managed by 'manager'.  Events
2706  * will be posted to 'task' and when dispatched 'action' will be
2707  * called with 'arg' as the arg value.  The new socket is returned
2708  * in 'socketp'.
2709  */
2710 ISC_SOCKETFUNC_SCOPE isc_result_t
2711 isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
2712                           isc_sockfdwatch_t callback, void *cbarg,
2713                           isc_task_t *task, isc_socket_t **socketp)
2714 {
2715         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2716         isc__socket_t *sock = NULL;
2717         isc_result_t result;
2718         int lockid;
2719
2720         REQUIRE(VALID_MANAGER(manager));
2721         REQUIRE(socketp != NULL && *socketp == NULL);
2722
2723         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2724         if (result != ISC_R_SUCCESS)
2725                 return (result);
2726
2727         sock->fd = fd;
2728         sock->fdwatcharg = cbarg;
2729         sock->fdwatchcb = callback;
2730         sock->fdwatchflags = flags;
2731         sock->fdwatchtask = task;
2732         sock->statsindex = fdwatchstatsindex;
2733
2734         sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2735         sock->references = 1;
2736         *socketp = (isc_socket_t *)sock;
2737
2738         /*
2739          * Note we don't have to lock the socket like we normally would because
2740          * there are no external references to it yet.
2741          */
2742
2743         lockid = FDLOCK_ID(sock->fd);
2744         LOCK(&manager->fdlock[lockid]);
2745         manager->fds[sock->fd] = sock;
2746         manager->fdstate[sock->fd] = MANAGED;
2747         UNLOCK(&manager->fdlock[lockid]);
2748
2749         LOCK(&manager->lock);
2750         ISC_LIST_APPEND(manager->socklist, sock, link);
2751 #ifdef USE_SELECT
2752         if (manager->maxfd < sock->fd)
2753                 manager->maxfd = sock->fd;
2754 #endif
2755         UNLOCK(&manager->lock);
2756
2757         if (flags & ISC_SOCKFDWATCH_READ)
2758                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2759         if (flags & ISC_SOCKFDWATCH_WRITE)
2760                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2761
2762         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2763                    ISC_MSG_CREATED, "fdwatch-created");
2764
2765         return (ISC_R_SUCCESS);
2766 }
2767
2768 /*
2769  * Indicate to the manager that it should watch the socket again.
2770  * This can be used to restart watching if the previous event handler
2771  * didn't indicate there was more data to be processed.  Primarily
2772  * it is for writing but could be used for reading if desired
2773  */
2774
2775 ISC_SOCKETFUNC_SCOPE isc_result_t
2776 isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags)
2777 {
2778         isc__socket_t *sock = (isc__socket_t *)sock0;
2779
2780         REQUIRE(VALID_SOCKET(sock));
2781
2782         /*
2783          * We check both flags first to allow us to get the lock
2784          * once but only if we need it.
2785          */
2786
2787         if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
2788                 LOCK(&sock->lock);
2789                 if (((flags & ISC_SOCKFDWATCH_READ) != 0) &&
2790                     !sock->pending_recv)
2791                         select_poke(sock->manager, sock->fd,
2792                                     SELECT_POKE_READ);
2793                 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) &&
2794                     !sock->pending_send)
2795                         select_poke(sock->manager, sock->fd,
2796                                     SELECT_POKE_WRITE);
2797                 UNLOCK(&sock->lock);
2798         }
2799
2800         socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2801                    ISC_MSG_POKED, "fdwatch-poked flags: %d", flags);
2802
2803         return (ISC_R_SUCCESS);
2804 }
2805
2806 /*
2807  * Attach to a socket.  Caller must explicitly detach when it is done.
2808  */
2809 ISC_SOCKETFUNC_SCOPE void
2810 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2811         isc__socket_t *sock = (isc__socket_t *)sock0;
2812
2813         REQUIRE(VALID_SOCKET(sock));
2814         REQUIRE(socketp != NULL && *socketp == NULL);
2815
2816         LOCK(&sock->lock);
2817         sock->references++;
2818         UNLOCK(&sock->lock);
2819
2820         *socketp = (isc_socket_t *)sock;
2821 }
2822
2823 /*
2824  * Dereference a socket.  If this is the last reference to it, clean things
2825  * up by destroying the socket.
2826  */
2827 ISC_SOCKETFUNC_SCOPE void
2828 isc__socket_detach(isc_socket_t **socketp) {
2829         isc__socket_t *sock;
2830         isc_boolean_t kill_socket = ISC_FALSE;
2831
2832         REQUIRE(socketp != NULL);
2833         sock = (isc__socket_t *)*socketp;
2834         REQUIRE(VALID_SOCKET(sock));
2835
2836         LOCK(&sock->lock);
2837         REQUIRE(sock->references > 0);
2838         sock->references--;
2839         if (sock->references == 0)
2840                 kill_socket = ISC_TRUE;
2841         UNLOCK(&sock->lock);
2842
2843         if (kill_socket)
2844                 destroy(&sock);
2845
2846         *socketp = NULL;
2847 }
2848
#ifdef BIND9
/*
 * Close the descriptor of a socket without destroying the socket object.
 * The socket must have exactly one reference, must not be an fdwatch
 * socket, must hold a descriptor below the manager's maxsocks, and must
 * have no pending or queued I/O.  Its state is reset (fd = -1, name, tag
 * and status flags cleared, peer address set to "any") and the old
 * descriptor is handed to closesocket().  Always returns ISC_R_SUCCESS.
 */
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_close(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc__socketmgr_t *manager;
	int fd;

	fflush(stdout);
	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(sock->references == 1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);
	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);

	/* Nothing may be in flight or queued on the socket. */
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(sock->connect_ev == NULL);

	/* Remember what closesocket() needs before the state is wiped. */
	manager = sock->manager;
	fd = sock->fd;

	/* Return the socket object to its unopened state. */
	sock->fd = -1;
	sock->bound = 0;
	sock->listener = 0;
	sock->connecting = 0;
	sock->connected = 0;
	sock->dupped = 0;
	sock->tag = NULL;
	memset(sock->name, 0, sizeof(sock->name));
	isc_sockaddr_any(&sock->peer_address);

	UNLOCK(&sock->lock);

	closesocket(manager, sock, fd);

	return (ISC_R_SUCCESS);
}
#endif  /* BIND9 */
2893
2894 /*
2895  * I/O is possible on a given socket.  Schedule an event to this task that
2896  * will call an internal function to do the I/O.  This will charge the
2897  * task with the I/O operation and let our select loop handler get back
2898  * to doing something real as fast as possible.
2899  *
2900  * The socket and manager must be locked before calling this function.
2901  */
2902 static void
2903 dispatch_recv(isc__socket_t *sock) {
2904         intev_t *iev;
2905         isc_socketevent_t *ev;
2906         isc_task_t *sender;
2907
2908         INSIST(!sock->pending_recv);
2909
2910         if (sock->type != isc_sockettype_fdwatch) {
2911                 ev = ISC_LIST_HEAD(sock->recv_list);
2912                 if (ev == NULL)
2913                         return;
2914                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2915                            "dispatch_recv:  event %p -> task %p",
2916                            ev, ev->ev_sender);
2917                 sender = ev->ev_sender;
2918         } else {
2919                 sender = sock->fdwatchtask;
2920         }
2921
2922         sock->pending_recv = 1;
2923         iev = &sock->readable_ev;
2924
2925         sock->references++;
2926         iev->ev_sender = sock;
2927         if (sock->type == isc_sockettype_fdwatch)
2928                 iev->ev_action = internal_fdwatch_read;
2929         else
2930                 iev->ev_action = internal_recv;
2931         iev->ev_arg = sock;
2932
2933         isc_task_send(sender, (isc_event_t **)&iev);
2934 }
2935
2936 static void
2937 dispatch_send(isc__socket_t *sock) {
2938         intev_t *iev;
2939         isc_socketevent_t *ev;
2940         isc_task_t *sender;
2941
2942         INSIST(!sock->pending_send);
2943
2944         if (sock->type != isc_sockettype_fdwatch) {
2945                 ev = ISC_LIST_HEAD(sock->send_list);
2946                 if (ev == NULL)
2947                         return;
2948                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2949                            "dispatch_send:  event %p -> task %p",
2950                            ev, ev->ev_sender);
2951                 sender = ev->ev_sender;
2952         } else {
2953                 sender = sock->fdwatchtask;
2954         }
2955
2956         sock->pending_send = 1;
2957         iev = &sock->writable_ev;
2958
2959         sock->references++;
2960         iev->ev_sender = sock;
2961         if (sock->type == isc_sockettype_fdwatch)
2962                 iev->ev_action = internal_fdwatch_write;
2963         else
2964                 iev->ev_action = internal_send;
2965         iev->ev_arg = sock;
2966
2967         isc_task_send(sender, (isc_event_t **)&iev);
2968 }
2969
2970 /*
2971  * Dispatch an internal accept event.
2972  */
2973 static void
2974 dispatch_accept(isc__socket_t *sock) {
2975         intev_t *iev;
2976         isc_socket_newconnev_t *ev;
2977
2978         INSIST(!sock->pending_accept);
2979
2980         /*
2981          * Are there any done events left, or were they all canceled
2982          * before the manager got the socket lock?
2983          */
2984         ev = ISC_LIST_HEAD(sock->accept_list);
2985         if (ev == NULL)
2986                 return;
2987
2988         sock->pending_accept = 1;
2989         iev = &sock->readable_ev;
2990
2991         sock->references++;  /* keep socket around for this internal event */
2992         iev->ev_sender = sock;
2993         iev->ev_action = internal_accept;
2994         iev->ev_arg = sock;
2995
2996         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2997 }
2998
2999 static void
3000 dispatch_connect(isc__socket_t *sock) {
3001         intev_t *iev;
3002         isc_socket_connev_t *ev;
3003
3004         iev = &sock->writable_ev;
3005
3006         ev = sock->connect_ev;
3007         INSIST(ev != NULL); /* XXX */
3008
3009         INSIST(sock->connecting);
3010
3011         sock->references++;  /* keep socket around for this internal event */
3012         iev->ev_sender = sock;
3013         iev->ev_action = internal_connect;
3014         iev->ev_arg = sock;
3015
3016         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
3017 }
3018
3019 /*
3020  * Dequeue an item off the given socket's read queue, set the result code
3021  * in the done event to the one provided, and send it to the task it was
3022  * destined for.
3023  *
3024  * If the event to be sent is on a list, remove it before sending.  If
3025  * asked to, send and detach from the socket as well.
3026  *
3027  * Caller must have the socket locked if the event is attached to the socket.
3028  */
3029 static void
3030 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3031         isc_task_t *task;
3032
3033         task = (*dev)->ev_sender;
3034
3035         (*dev)->ev_sender = sock;
3036
3037         if (ISC_LINK_LINKED(*dev, ev_link))
3038                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
3039
3040         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3041             == ISC_SOCKEVENTATTR_ATTACHED)
3042                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
3043         else
3044                 isc_task_send(task, (isc_event_t **)dev);
3045 }
3046
3047 /*
3048  * See comments for send_recvdone_event() above.
3049  *
3050  * Caller must have the socket locked if the event is attached to the socket.
3051  */
3052 static void
3053 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3054         isc_task_t *task;
3055
3056         INSIST(dev != NULL && *dev != NULL);
3057
3058         task = (*dev)->ev_sender;
3059         (*dev)->ev_sender = sock;
3060
3061         if (ISC_LINK_LINKED(*dev, ev_link))
3062                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
3063
3064         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3065             == ISC_SOCKEVENTATTR_ATTACHED)
3066                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
3067         else
3068                 isc_task_send(task, (isc_event_t **)dev);
3069 }
3070
3071 /*
3072  * Call accept() on a socket, to get the new file descriptor.  The listen
3073  * socket is used as a prototype to create a new isc_socket_t.  The new
3074  * socket has one outstanding reference.  The task receiving the event
3075  * will be detached from just after the event is delivered.
3076  *
3077  * On entry to this function, the event delivered is the internal
3078  * readable event, and the first item on the accept_list should be
3079  * the done event we want to send.  If the list is empty, this is a no-op,
3080  * so just unlock and return.
3081  */
3082 static void
3083 internal_accept(isc_task_t *me, isc_event_t *ev) {
3084         isc__socket_t *sock;
3085         isc__socketmgr_t *manager;
3086         isc_socket_newconnev_t *dev;
3087         isc_task_t *task;
3088         ISC_SOCKADDR_LEN_T addrlen;
3089         int fd;
3090         isc_result_t result = ISC_R_SUCCESS;
3091         char strbuf[ISC_STRERRORSIZE];
3092         const char *err = "accept";
3093
3094         UNUSED(me);
3095
3096         sock = ev->ev_sender;
3097         INSIST(VALID_SOCKET(sock));
3098
3099         LOCK(&sock->lock);
3100         socket_log(sock, NULL, TRACE,
3101                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
3102                    "internal_accept called, locked socket");
3103
3104         manager = sock->manager;
3105         INSIST(VALID_MANAGER(manager));
3106
3107         INSIST(sock->listener);
3108         INSIST(sock->pending_accept == 1);
3109         sock->pending_accept = 0;
3110
3111         INSIST(sock->references > 0);
3112         sock->references--;  /* the internal event is done with this socket */
3113         if (sock->references == 0) {
3114                 UNLOCK(&sock->lock);
3115                 destroy(&sock);
3116                 return;
3117         }
3118
3119         /*
3120          * Get the first item off the accept list.
3121          * If it is empty, unlock the socket and return.
3122          */
3123         dev = ISC_LIST_HEAD(sock->accept_list);
3124         if (dev == NULL) {
3125                 UNLOCK(&sock->lock);
3126                 return;
3127         }
3128
3129         /*
3130          * Try to accept the new connection.  If the accept fails with
3131          * EAGAIN or EINTR, simply poke the watcher to watch this socket
3132          * again.  Also ignore ECONNRESET, which has been reported to
3133          * be spuriously returned on Linux 2.2.19 although it is not
3134          * a documented error for accept().  ECONNABORTED has been
3135          * reported for Solaris 8.  The rest are thrown in not because
3136          * we have seen them but because they are ignored by other
3137          * daemons such as BIND 8 and Apache.
3138          */
3139
3140         addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
3141         memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
3142         fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
3143                     (void *)&addrlen);
3144
3145 #ifdef F_DUPFD
3146         /*
3147          * Leave a space for stdio to work in.
3148          */
3149         if (fd >= 0 && fd < 20) {
3150                 int new, tmp;
3151                 new = fcntl(fd, F_DUPFD, 20);
3152                 tmp = errno;
3153                 (void)close(fd);
3154                 errno = tmp;
3155                 fd = new;
3156                 err = "accept/fcntl";
3157         }
3158 #endif
3159
3160         if (fd < 0) {
3161                 if (SOFT_ERROR(errno))
3162                         goto soft_error;
3163                 switch (errno) {
3164                 case ENFILE:
3165                 case EMFILE:
3166                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3167                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3168                                        isc_msgcat, ISC_MSGSET_SOCKET,
3169                                        ISC_MSG_TOOMANYFDS,
3170                                        "%s: too many open file descriptors",
3171                                        err);
3172                         goto soft_error;
3173
3174                 case ENOBUFS:
3175                 case ENOMEM:
3176                 case ECONNRESET:
3177                 case ECONNABORTED:
3178                 case EHOSTUNREACH:
3179                 case EHOSTDOWN:
3180                 case ENETUNREACH:
3181                 case ENETDOWN:
3182                 case ECONNREFUSED:
3183 #ifdef EPROTO
3184                 case EPROTO:
3185 #endif
3186 #ifdef ENONET
3187                 case ENONET:
3188 #endif
3189                         goto soft_error;
3190                 default:
3191                         break;
3192                 }
3193                 isc__strerror(errno, strbuf, sizeof(strbuf));
3194                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3195                                  "internal_accept: %s() %s: %s", err,
3196                                  isc_msgcat_get(isc_msgcat,
3197                                                 ISC_MSGSET_GENERAL,
3198                                                 ISC_MSG_FAILED,
3199                                                 "failed"),
3200                                  strbuf);
3201                 fd = -1;
3202                 result = ISC_R_UNEXPECTED;
3203         } else {
3204                 if (addrlen == 0U) {
3205                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3206                                          "internal_accept(): "
3207                                          "accept() failed to return "
3208                                          "remote address");
3209
3210                         (void)close(fd);
3211                         goto soft_error;
3212                 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
3213                            sock->pf)
3214                 {
3215                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3216                                          "internal_accept(): "
3217                                          "accept() returned peer address "
3218                                          "family %u (expected %u)",
3219                                          NEWCONNSOCK(dev)->peer_address.
3220                                          type.sa.sa_family,
3221                                          sock->pf);
3222                         (void)close(fd);
3223                         goto soft_error;
3224                 } else if (fd >= (int)manager->maxsocks) {
3225                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3226                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3227                                        isc_msgcat, ISC_MSGSET_SOCKET,
3228                                        ISC_MSG_TOOMANYFDS,
3229                                        "accept: "
3230                                        "file descriptor exceeds limit (%d/%u)",
3231                                        fd, manager->maxsocks);
3232                         (void)close(fd);
3233                         goto soft_error;
3234                 }
3235         }
3236
3237         if (fd != -1) {
3238                 NEWCONNSOCK(dev)->peer_address.length = addrlen;
3239                 NEWCONNSOCK(dev)->pf = sock->pf;
3240         }
3241
3242         /*
3243          * Pull off the done event.
3244          */
3245         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
3246
3247         /*
3248          * Poke watcher if there are more pending accepts.
3249          */
3250         if (!ISC_LIST_EMPTY(sock->accept_list))
3251                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3252
3253         UNLOCK(&sock->lock);
3254
3255         if (fd != -1) {
3256                 result = make_nonblock(fd);
3257                 if (result != ISC_R_SUCCESS) {
3258                         (void)close(fd);
3259                         fd = -1;
3260                 }
3261         }
3262
3263         /*
3264          * -1 means the new socket didn't happen.
3265          */
3266         if (fd != -1) {
3267                 int lockid = FDLOCK_ID(fd);
3268
3269                 LOCK(&manager->fdlock[lockid]);
3270                 manager->fds[fd] = NEWCONNSOCK(dev);
3271                 manager->fdstate[fd] = MANAGED;
3272                 UNLOCK(&manager->fdlock[lockid]);
3273
3274                 LOCK(&manager->lock);
3275                 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3276
3277                 NEWCONNSOCK(dev)->fd = fd;
3278                 NEWCONNSOCK(dev)->bound = 1;
3279                 NEWCONNSOCK(dev)->connected = 1;
3280
3281                 /*
3282                  * Save away the remote address
3283                  */
3284                 dev->address = NEWCONNSOCK(dev)->peer_address;
3285
3286 #ifdef USE_SELECT
3287                 if (manager->maxfd < fd)
3288                         manager->maxfd = fd;
3289 #endif
3290
3291                 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3292                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
3293                            "accepted connection, new socket %p",
3294                            dev->newsocket);
3295
3296                 UNLOCK(&manager->lock);
3297
3298                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3299         } else {
3300                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3301                 NEWCONNSOCK(dev)->references--;
3302                 free_socket((isc__socket_t **)&dev->newsocket);
3303         }
3304
3305         /*
3306          * Fill in the done event details and send it off.
3307          */
3308         dev->result = result;
3309         task = dev->ev_sender;
3310         dev->ev_sender = sock;
3311
3312         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3313         return;
3314
3315  soft_error:
3316         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3317         UNLOCK(&sock->lock);
3318
3319         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3320         return;
3321 }
3322
3323 static void
3324 internal_recv(isc_task_t *me, isc_event_t *ev) {
3325         isc_socketevent_t *dev;
3326         isc__socket_t *sock;
3327
3328         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3329
3330         sock = ev->ev_sender;
3331         INSIST(VALID_SOCKET(sock));
3332
3333         LOCK(&sock->lock);
3334         socket_log(sock, NULL, IOEVENT,
3335                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3336                    "internal_recv: task %p got event %p", me, ev);
3337
3338         INSIST(sock->pending_recv == 1);
3339         sock->pending_recv = 0;
3340
3341         INSIST(sock->references > 0);
3342         sock->references--;  /* the internal event is done with this socket */
3343         if (sock->references == 0) {
3344                 UNLOCK(&sock->lock);
3345                 destroy(&sock);
3346                 return;
3347         }
3348
3349         /*
3350          * Try to do as much I/O as possible on this socket.  There are no
3351          * limits here, currently.
3352          */
3353         dev = ISC_LIST_HEAD(sock->recv_list);
3354         while (dev != NULL) {
3355                 switch (doio_recv(sock, dev)) {
3356                 case DOIO_SOFT:
3357                         goto poke;
3358
3359                 case DOIO_EOF:
3360                         /*
3361                          * read of 0 means the remote end was closed.
3362                          * Run through the event queue and dispatch all
3363                          * the events with an EOF result code.
3364                          */
3365                         do {
3366                                 dev->result = ISC_R_EOF;
3367                                 send_recvdone_event(sock, &dev);
3368                                 dev = ISC_LIST_HEAD(sock->recv_list);
3369                         } while (dev != NULL);
3370                         goto poke;
3371
3372                 case DOIO_SUCCESS:
3373                 case DOIO_HARD:
3374                         send_recvdone_event(sock, &dev);
3375                         break;
3376                 }
3377
3378                 dev = ISC_LIST_HEAD(sock->recv_list);
3379         }
3380
3381  poke:
3382         if (!ISC_LIST_EMPTY(sock->recv_list))
3383                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3384
3385         UNLOCK(&sock->lock);
3386 }
3387
3388 static void
3389 internal_send(isc_task_t *me, isc_event_t *ev) {
3390         isc_socketevent_t *dev;
3391         isc__socket_t *sock;
3392
3393         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3394
3395         /*
3396          * Find out what socket this is and lock it.
3397          */
3398         sock = (isc__socket_t *)ev->ev_sender;
3399         INSIST(VALID_SOCKET(sock));
3400
3401         LOCK(&sock->lock);
3402         socket_log(sock, NULL, IOEVENT,
3403                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3404                    "internal_send: task %p got event %p", me, ev);
3405
3406         INSIST(sock->pending_send == 1);
3407         sock->pending_send = 0;
3408
3409         INSIST(sock->references > 0);
3410         sock->references--;  /* the internal event is done with this socket */
3411         if (sock->references == 0) {
3412                 UNLOCK(&sock->lock);
3413                 destroy(&sock);
3414                 return;
3415         }
3416
3417         /*
3418          * Try to do as much I/O as possible on this socket.  There are no
3419          * limits here, currently.
3420          */
3421         dev = ISC_LIST_HEAD(sock->send_list);
3422         while (dev != NULL) {
3423                 switch (doio_send(sock, dev)) {
3424                 case DOIO_SOFT:
3425                         goto poke;
3426
3427                 case DOIO_HARD:
3428                 case DOIO_SUCCESS:
3429                         send_senddone_event(sock, &dev);
3430                         break;
3431                 }
3432
3433                 dev = ISC_LIST_HEAD(sock->send_list);
3434         }
3435
3436  poke:
3437         if (!ISC_LIST_EMPTY(sock->send_list))
3438                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3439
3440         UNLOCK(&sock->lock);
3441 }
3442
3443 static void
3444 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3445         isc__socket_t *sock;
3446         int more_data;
3447
3448         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3449
3450         /*
3451          * Find out what socket this is and lock it.
3452          */
3453         sock = (isc__socket_t *)ev->ev_sender;
3454         INSIST(VALID_SOCKET(sock));
3455
3456         LOCK(&sock->lock);
3457         socket_log(sock, NULL, IOEVENT,
3458                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3459                    "internal_fdwatch_write: task %p got event %p", me, ev);
3460
3461         INSIST(sock->pending_send == 1);
3462
3463         UNLOCK(&sock->lock);
3464         more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3465                                       sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3466         LOCK(&sock->lock);
3467
3468         sock->pending_send = 0;
3469
3470         INSIST(sock->references > 0);
3471         sock->references--;  /* the internal event is done with this socket */
3472         if (sock->references == 0) {
3473                 UNLOCK(&sock->lock);
3474                 destroy(&sock);
3475                 return;
3476         }
3477
3478         if (more_data)
3479                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3480
3481         UNLOCK(&sock->lock);
3482 }
3483
3484 static void
3485 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3486         isc__socket_t *sock;
3487         int more_data;
3488
3489         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3490
3491         /*
3492          * Find out what socket this is and lock it.
3493          */
3494         sock = (isc__socket_t *)ev->ev_sender;
3495         INSIST(VALID_SOCKET(sock));
3496
3497         LOCK(&sock->lock);
3498         socket_log(sock, NULL, IOEVENT,
3499                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3500                    "internal_fdwatch_read: task %p got event %p", me, ev);
3501
3502         INSIST(sock->pending_recv == 1);
3503
3504         UNLOCK(&sock->lock);
3505         more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3506                                       sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3507         LOCK(&sock->lock);
3508
3509         sock->pending_recv = 0;
3510
3511         INSIST(sock->references > 0);
3512         sock->references--;  /* the internal event is done with this socket */
3513         if (sock->references == 0) {
3514                 UNLOCK(&sock->lock);
3515                 destroy(&sock);
3516                 return;
3517         }
3518
3519         if (more_data)
3520                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3521
3522         UNLOCK(&sock->lock);
3523 }
3524
3525 /*
3526  * Process read/writes on each fd here.  Avoid locking
3527  * and unlocking twice if both reads and writes are possible.
3528  */
3529 static void
3530 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
3531            isc_boolean_t writeable)
3532 {
3533         isc__socket_t *sock;
3534         isc_boolean_t unlock_sock;
3535         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3536         int lockid = FDLOCK_ID(fd);
3537
3538         /*
3539          * If the socket is going to be closed, don't do more I/O.
3540          */
3541         LOCK(&manager->fdlock[lockid]);
3542         if (manager->fdstate[fd] == CLOSE_PENDING) {
3543                 UNLOCK(&manager->fdlock[lockid]);
3544
3545                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3546                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3547                 return;
3548         }
3549
3550         sock = manager->fds[fd];
3551         unlock_sock = ISC_FALSE;
3552         if (readable) {
3553                 if (sock == NULL) {
3554                         unwatch_read = ISC_TRUE;
3555                         goto check_write;
3556                 }
3557                 unlock_sock = ISC_TRUE;
3558                 LOCK(&sock->lock);
3559                 if (!SOCK_DEAD(sock)) {
3560                         if (sock->listener)
3561                                 dispatch_accept(sock);
3562                         else
3563                                 dispatch_recv(sock);
3564                 }
3565                 unwatch_read = ISC_TRUE;
3566         }
3567 check_write:
3568         if (writeable) {
3569                 if (sock == NULL) {
3570                         unwatch_write = ISC_TRUE;
3571                         goto unlock_fd;
3572                 }
3573                 if (!unlock_sock) {
3574                         unlock_sock = ISC_TRUE;
3575                         LOCK(&sock->lock);
3576                 }
3577                 if (!SOCK_DEAD(sock)) {
3578                         if (sock->connecting)
3579                                 dispatch_connect(sock);
3580                         else
3581                                 dispatch_send(sock);
3582                 }
3583                 unwatch_write = ISC_TRUE;
3584         }
3585         if (unlock_sock)
3586                 UNLOCK(&sock->lock);
3587
3588  unlock_fd:
3589         UNLOCK(&manager->fdlock[lockid]);
3590         if (unwatch_read)
3591                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3592         if (unwatch_write)
3593                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3594
3595 }
3596
#ifdef USE_KQUEUE
/*
 * kqueue variant: run process_fd() for each of the 'nevents' kevents.
 * With a watcher thread, an event on the read end of the control pipe is
 * deferred until all other events are handled and is then passed to
 * process_ctlfd(), whose result is returned.  Otherwise returns ISC_FALSE.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
	int i;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t ctl_pending = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		/*
		 * This is not an error, but something unexpected.  If this
		 * happens, it may indicate the need for increasing
		 * ISC_SOCKET_MAXEVENTS.
		 */
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].ident < manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
			ctl_pending = ISC_TRUE;
			continue;
		}
#endif
		process_fd(manager, events[i].ident,
			   ISC_TF(events[i].filter == EVFILT_READ),
			   ISC_TF(events[i].filter == EVFILT_WRITE));
	}

#ifdef USE_WATCHER_THREAD
	if (ctl_pending)
		return (process_ctlfd(manager));
#endif

	return (ISC_FALSE);
}
#elif defined(USE_EPOLL)
/*
 * epoll variant: run process_fd() for each of the 'nevents' epoll events.
 * With a watcher thread, an event on the read end of the control pipe is
 * deferred until all other events are handled and is then passed to
 * process_ctlfd(), whose result is returned.  Otherwise returns ISC_FALSE.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
{
	int i;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t ctl_pending = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		if (events[i].data.fd == manager->pipe_fds[0]) {
			ctl_pending = ISC_TRUE;
			continue;
		}
#endif
		if ((events[i].events & (EPOLLERR | EPOLLHUP)) != 0) {
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			events[i].events |= (EPOLLIN | EPOLLOUT);
		}
		process_fd(manager, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (ctl_pending)
		return (process_ctlfd(manager));
#endif

	return (ISC_FALSE);
}
#elif defined(USE_DEVPOLL)
/*
 * /dev/poll variant: run process_fd() for each of the 'nevents' pollfd
 * entries.  With a watcher thread, an event on the read end of the control
 * pipe is deferred until all other events are handled and is then passed
 * to process_ctlfd(), whose result is returned.  Otherwise returns
 * ISC_FALSE.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
	int i;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t ctl_pending = ISC_FALSE;
#endif

	if (nevents == manager->nevents) {
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		if (events[i].fd == manager->pipe_fds[0]) {
			ctl_pending = ISC_TRUE;
			continue;
		}
#endif
		process_fd(manager, events[i].fd,
			   (events[i].events & POLLIN) != 0,
			   (events[i].events & POLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (ctl_pending)
		return (process_ctlfd(manager));
#endif

	return (ISC_FALSE);
}
#elif defined(USE_SELECT)
/*
 * select variant: run process_fd() for every descriptor below 'maxfd',
 * passing its membership in 'readfds' and 'writefds'.  With a watcher
 * thread, both ends of the control pipe are skipped here.
 */
static void
process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
	    fd_set *writefds)
{
	int fd;

	REQUIRE(maxfd <= (int)manager->maxsocks);

	for (fd = 0; fd < maxfd; fd++) {
#ifdef USE_WATCHER_THREAD
		if (fd == manager->pipe_fds[0] || fd == manager->pipe_fds[1])
			continue;
#endif /* USE_WATCHER_THREAD */
		process_fd(manager, fd, FD_ISSET(fd, readfds),
			   FD_ISSET(fd, writefds));
	}
}
#endif
3742
3743 #ifdef USE_WATCHER_THREAD
3744 static isc_boolean_t
3745 process_ctlfd(isc__socketmgr_t *manager) {
3746         int msg, fd;
3747
3748         for (;;) {
3749                 select_readmsg(manager, &fd, &msg);
3750
3751                 manager_log(manager, IOEVENT,
3752                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3753                                            ISC_MSG_WATCHERMSG,
3754                                            "watcher got message %d "
3755                                            "for socket %d"), msg, fd);
3756
3757                 /*
3758                  * Nothing to read?
3759                  */
3760                 if (msg == SELECT_POKE_NOTHING)
3761                         break;
3762
3763                 /*
3764                  * Handle shutdown message.  We really should
3765                  * jump out of this loop right away, but
3766                  * it doesn't matter if we have to do a little
3767                  * more work first.
3768                  */
3769                 if (msg == SELECT_POKE_SHUTDOWN)
3770                         return (ISC_TRUE);
3771
3772                 /*
3773                  * This is a wakeup on a socket.  Look
3774                  * at the event queue for both read and write,
3775                  * and decide if we need to watch on it now
3776                  * or not.
3777                  */
3778                 wakeup_socket(manager, fd, msg);
3779         }
3780
3781         return (ISC_FALSE);
3782 }
3783
3784 /*
3785  * This is the thread that will loop forever, always in a select or poll
3786  * call.
3787  *
3788  * When select returns something to do, track down what thread gets to do
3789  * this I/O and post the event to it.
3790  */
3791 static isc_threadresult_t
3792 watcher(void *uap) {
3793         isc__socketmgr_t *manager = uap;
3794         isc_boolean_t done;
3795         int cc;
3796 #ifdef USE_KQUEUE
3797         const char *fnname = "kevent()";
3798 #elif defined (USE_EPOLL)
3799         const char *fnname = "epoll_wait()";
3800 #elif defined(USE_DEVPOLL)
3801         const char *fnname = "ioctl(DP_POLL)";
3802         struct dvpoll dvp;
3803 #elif defined (USE_SELECT)
3804         const char *fnname = "select()";
3805         int maxfd;
3806         int ctlfd;
3807 #endif
3808         char strbuf[ISC_STRERRORSIZE];
3809 #ifdef ISC_SOCKET_USE_POLLWATCH
3810         pollstate_t pollstate = poll_idle;
3811 #endif
3812
3813 #if defined (USE_SELECT)
3814         /*
3815          * Get the control fd here.  This will never change.
3816          */
3817         ctlfd = manager->pipe_fds[0];
3818 #endif
3819         done = ISC_FALSE;
3820         while (!done) {
3821                 do {
3822 #ifdef USE_KQUEUE
3823                         cc = kevent(manager->kqueue_fd, NULL, 0,
3824                                     manager->events, manager->nevents, NULL);
3825 #elif defined(USE_EPOLL)
3826                         cc = epoll_wait(manager->epoll_fd, manager->events,
3827                                         manager->nevents, -1);
3828 #elif defined(USE_DEVPOLL)
3829                         dvp.dp_fds = manager->events;
3830                         dvp.dp_nfds = manager->nevents;
3831 #ifndef ISC_SOCKET_USE_POLLWATCH
3832                         dvp.dp_timeout = -1;
3833 #else
3834                         if (pollstate == poll_idle)
3835                                 dvp.dp_timeout = -1;
3836                         else
3837                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3838 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3839                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3840 #elif defined(USE_SELECT)
3841                         LOCK(&manager->lock);
3842                         memcpy(manager->read_fds_copy, manager->read_fds,
3843                                manager->fd_bufsize);
3844                         memcpy(manager->write_fds_copy, manager->write_fds,
3845                                manager->fd_bufsize);
3846                         maxfd = manager->maxfd + 1;
3847                         UNLOCK(&manager->lock);
3848
3849                         cc = select(maxfd, manager->read_fds_copy,
3850                                     manager->write_fds_copy, NULL, NULL);
3851 #endif  /* USE_KQUEUE */
3852
3853                         if (cc < 0 && !SOFT_ERROR(errno)) {
3854                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3855                                 FATAL_ERROR(__FILE__, __LINE__,
3856                                             "%s %s: %s", fnname,
3857                                             isc_msgcat_get(isc_msgcat,
3858                                                            ISC_MSGSET_GENERAL,
3859                                                            ISC_MSG_FAILED,
3860                                                            "failed"), strbuf);
3861                         }
3862
3863 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3864                         if (cc == 0) {
3865                                 if (pollstate == poll_active)
3866                                         pollstate = poll_checking;
3867                                 else if (pollstate == poll_checking)
3868                                         pollstate = poll_idle;
3869                         } else if (cc > 0) {
3870                                 if (pollstate == poll_checking) {
3871                                         /*
3872                                          * XXX: We'd like to use a more
3873                                          * verbose log level as it's actually an
3874                                          * unexpected event, but the kernel bug
3875                                          * reportedly happens pretty frequently
3876                                          * (and it can also be a false positive)
3877                                          * so it would be just too noisy.
3878                                          */
3879                                         manager_log(manager,
3880                                                     ISC_LOGCATEGORY_GENERAL,
3881                                                     ISC_LOGMODULE_SOCKET,
3882                                                     ISC_LOG_DEBUG(1),
3883                                                     "unexpected POLL timeout");
3884                                 }
3885                                 pollstate = poll_active;
3886                         }
3887 #endif
3888                 } while (cc < 0);
3889
3890 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3891                 done = process_fds(manager, manager->events, cc);
3892 #elif defined(USE_SELECT)
3893                 process_fds(manager, maxfd, manager->read_fds_copy,
3894                             manager->write_fds_copy);
3895
3896                 /*
3897                  * Process reads on internal, control fd.
3898                  */
3899                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3900                         done = process_ctlfd(manager);
3901 #endif
3902         }
3903
3904         manager_log(manager, TRACE, "%s",
3905                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3906                                    ISC_MSG_EXITING, "watcher exiting"));
3907
3908         return ((isc_threadresult_t)0);
3909 }
3910 #endif /* USE_WATCHER_THREAD */
3911
3912 #ifdef BIND9
3913 ISC_SOCKETFUNC_SCOPE void
3914 isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
3915         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3916
3917         REQUIRE(VALID_MANAGER(manager));
3918
3919         manager->reserved = reserved;
3920 }
3921
3922 ISC_SOCKETFUNC_SCOPE void
3923 isc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
3924         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3925
3926         REQUIRE(VALID_MANAGER(manager));
3927
3928         manager->maxudp = maxudp;
3929 }
3930 #endif  /* BIND9 */
3931
3932 /*
3933  * Create a new socket manager.
3934  */
3935
3936 static isc_result_t
3937 setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
3938         isc_result_t result;
3939 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3940         char strbuf[ISC_STRERRORSIZE];
3941 #endif
3942
3943 #ifdef USE_KQUEUE
3944         manager->nevents = ISC_SOCKET_MAXEVENTS;
3945         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3946                                       manager->nevents);
3947         if (manager->events == NULL)
3948                 return (ISC_R_NOMEMORY);
3949         manager->kqueue_fd = kqueue();
3950         if (manager->kqueue_fd == -1) {
3951                 result = isc__errno2result(errno);
3952                 isc__strerror(errno, strbuf, sizeof(strbuf));
3953                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3954                                  "kqueue %s: %s",
3955                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3956                                                 ISC_MSG_FAILED, "failed"),
3957                                  strbuf);
3958                 isc_mem_put(mctx, manager->events,
3959                             sizeof(struct kevent) * manager->nevents);
3960                 return (result);
3961         }
3962
3963 #ifdef USE_WATCHER_THREAD
3964         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3965         if (result != ISC_R_SUCCESS) {
3966                 close(manager->kqueue_fd);
3967                 isc_mem_put(mctx, manager->events,
3968                             sizeof(struct kevent) * manager->nevents);
3969                 return (result);
3970         }
3971 #endif  /* USE_WATCHER_THREAD */
3972 #elif defined(USE_EPOLL)
3973         manager->nevents = ISC_SOCKET_MAXEVENTS;
3974         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3975                                       manager->nevents);
3976         if (manager->events == NULL)
3977                 return (ISC_R_NOMEMORY);
3978         manager->epoll_fd = epoll_create(manager->nevents);
3979         if (manager->epoll_fd == -1) {
3980                 result = isc__errno2result(errno);
3981                 isc__strerror(errno, strbuf, sizeof(strbuf));
3982                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3983                                  "epoll_create %s: %s",
3984                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3985                                                 ISC_MSG_FAILED, "failed"),
3986                                  strbuf);
3987                 isc_mem_put(mctx, manager->events,
3988                             sizeof(struct epoll_event) * manager->nevents);
3989                 return (result);
3990         }
3991 #ifdef USE_WATCHER_THREAD
3992         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3993         if (result != ISC_R_SUCCESS) {
3994                 close(manager->epoll_fd);
3995                 isc_mem_put(mctx, manager->events,
3996                             sizeof(struct epoll_event) * manager->nevents);
3997                 return (result);
3998         }
3999 #endif  /* USE_WATCHER_THREAD */
4000 #elif defined(USE_DEVPOLL)
4001         /*
4002          * XXXJT: /dev/poll seems to reject large numbers of events,
4003          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
4004          */
4005         manager->nevents = ISC_SOCKET_MAXEVENTS;
4006         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
4007                                       manager->nevents);
4008         if (manager->events == NULL)
4009                 return (ISC_R_NOMEMORY);
4010         /*
4011          * Note: fdpollinfo should be able to support all possible FDs, so
4012          * it must have maxsocks entries (not nevents).
4013          */
4014         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
4015                                           manager->maxsocks);
4016         if (manager->fdpollinfo == NULL) {
4017                 isc_mem_put(mctx, manager->events,
4018                             sizeof(struct pollfd) * manager->nevents);
4019                 return (ISC_R_NOMEMORY);
4020         }
4021         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
4022         manager->devpoll_fd = open("/dev/poll", O_RDWR);
4023         if (manager->devpoll_fd == -1) {
4024                 result = isc__errno2result(errno);
4025                 isc__strerror(errno, strbuf, sizeof(strbuf));
4026                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4027                                  "open(/dev/poll) %s: %s",
4028                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4029                                                 ISC_MSG_FAILED, "failed"),
4030                                  strbuf);
4031                 isc_mem_put(mctx, manager->events,
4032                             sizeof(struct pollfd) * manager->nevents);
4033                 isc_mem_put(mctx, manager->fdpollinfo,
4034                             sizeof(pollinfo_t) * manager->maxsocks);
4035                 return (result);
4036         }
4037 #ifdef USE_WATCHER_THREAD
4038         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4039         if (result != ISC_R_SUCCESS) {
4040                 close(manager->devpoll_fd);
4041                 isc_mem_put(mctx, manager->events,
4042                             sizeof(struct pollfd) * manager->nevents);
4043                 isc_mem_put(mctx, manager->fdpollinfo,
4044                             sizeof(pollinfo_t) * manager->maxsocks);
4045                 return (result);
4046         }
4047 #endif  /* USE_WATCHER_THREAD */
4048 #elif defined(USE_SELECT)
4049         UNUSED(result);
4050
4051 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
4052         /*
4053          * Note: this code should also cover the case of MAXSOCKETS <=
4054          * FD_SETSIZE, but we separate the cases to avoid possible portability
4055          * issues regarding howmany() and the actual representation of fd_set.
4056          */
4057         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
4058                 sizeof(fd_mask);
4059 #else
4060         manager->fd_bufsize = sizeof(fd_set);
4061 #endif
4062
4063         manager->read_fds = NULL;
4064         manager->read_fds_copy = NULL;
4065         manager->write_fds = NULL;
4066         manager->write_fds_copy = NULL;
4067
4068         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
4069         if (manager->read_fds != NULL)
4070                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
4071         if (manager->read_fds_copy != NULL)
4072                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
4073         if (manager->write_fds != NULL) {
4074                 manager->write_fds_copy = isc_mem_get(mctx,
4075                                                       manager->fd_bufsize);
4076         }
4077         if (manager->write_fds_copy == NULL) {
4078                 if (manager->write_fds != NULL) {
4079                         isc_mem_put(mctx, manager->write_fds,
4080                                     manager->fd_bufsize);
4081                 }
4082                 if (manager->read_fds_copy != NULL) {
4083                         isc_mem_put(mctx, manager->read_fds_copy,
4084                                     manager->fd_bufsize);
4085                 }
4086                 if (manager->read_fds != NULL) {
4087                         isc_mem_put(mctx, manager->read_fds,
4088                                     manager->fd_bufsize);
4089                 }
4090                 return (ISC_R_NOMEMORY);
4091         }
4092         memset(manager->read_fds, 0, manager->fd_bufsize);
4093         memset(manager->write_fds, 0, manager->fd_bufsize);
4094
4095 #ifdef USE_WATCHER_THREAD
4096         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4097         manager->maxfd = manager->pipe_fds[0];
4098 #else /* USE_WATCHER_THREAD */
4099         manager->maxfd = 0;
4100 #endif /* USE_WATCHER_THREAD */
4101 #endif  /* USE_KQUEUE */
4102
4103         return (ISC_R_SUCCESS);
4104 }
4105
4106 static void
4107 cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
4108 #ifdef USE_WATCHER_THREAD
4109         isc_result_t result;
4110
4111         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4112         if (result != ISC_R_SUCCESS) {
4113                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4114                                  "epoll_ctl(DEL) %s",
4115                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4116                                                 ISC_MSG_FAILED, "failed"));
4117         }
4118 #endif  /* USE_WATCHER_THREAD */
4119
4120 #ifdef USE_KQUEUE
4121         close(manager->kqueue_fd);
4122         isc_mem_put(mctx, manager->events,
4123                     sizeof(struct kevent) * manager->nevents);
4124 #elif defined(USE_EPOLL)
4125         close(manager->epoll_fd);
4126         isc_mem_put(mctx, manager->events,
4127                     sizeof(struct epoll_event) * manager->nevents);
4128 #elif defined(USE_DEVPOLL)
4129         close(manager->devpoll_fd);
4130         isc_mem_put(mctx, manager->events,
4131                     sizeof(struct pollfd) * manager->nevents);
4132         isc_mem_put(mctx, manager->fdpollinfo,
4133                     sizeof(pollinfo_t) * manager->maxsocks);
4134 #elif defined(USE_SELECT)
4135         if (manager->read_fds != NULL)
4136                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
4137         if (manager->read_fds_copy != NULL)
4138                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
4139         if (manager->write_fds != NULL)
4140                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
4141         if (manager->write_fds_copy != NULL)
4142                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
4143 #endif  /* USE_KQUEUE */
4144 }
4145
4146 ISC_SOCKETFUNC_SCOPE isc_result_t
4147 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
4148         return (isc__socketmgr_create2(mctx, managerp, 0));
4149 }
4150
4151 ISC_SOCKETFUNC_SCOPE isc_result_t
4152 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
4153                        unsigned int maxsocks)
4154 {
4155         int i;
4156         isc__socketmgr_t *manager;
4157 #ifdef USE_WATCHER_THREAD
4158         char strbuf[ISC_STRERRORSIZE];
4159 #endif
4160         isc_result_t result;
4161
4162         REQUIRE(managerp != NULL && *managerp == NULL);
4163
4164 #ifdef USE_SHARED_MANAGER
4165         if (socketmgr != NULL) {
4166                 /* Don't allow maxsocks to be updated */
4167                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
4168                         return (ISC_R_EXISTS);
4169
4170                 socketmgr->refs++;
4171                 *managerp = (isc_socketmgr_t *)socketmgr;
4172                 return (ISC_R_SUCCESS);
4173         }
4174 #endif /* USE_SHARED_MANAGER */
4175
4176         if (maxsocks == 0)
4177                 maxsocks = ISC_SOCKET_MAXSOCKETS;
4178
4179         manager = isc_mem_get(mctx, sizeof(*manager));
4180         if (manager == NULL)
4181                 return (ISC_R_NOMEMORY);
4182
4183         /* zero-clear so that necessary cleanup on failure will be easy */
4184         memset(manager, 0, sizeof(*manager));
4185         manager->maxsocks = maxsocks;
4186         manager->reserved = 0;
4187         manager->maxudp = 0;
4188         manager->fds = isc_mem_get(mctx,
4189                                    manager->maxsocks * sizeof(isc__socket_t *));
4190         if (manager->fds == NULL) {
4191                 result = ISC_R_NOMEMORY;
4192                 goto free_manager;
4193         }
4194         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
4195         if (manager->fdstate == NULL) {
4196                 result = ISC_R_NOMEMORY;
4197                 goto free_manager;
4198         }
4199         manager->stats = NULL;
4200
4201         manager->common.methods = &socketmgrmethods;
4202         manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
4203         manager->common.impmagic = SOCKET_MANAGER_MAGIC;
4204         manager->mctx = NULL;
4205         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
4206         ISC_LIST_INIT(manager->socklist);
4207         result = isc_mutex_init(&manager->lock);
4208         if (result != ISC_R_SUCCESS)
4209                 goto free_manager;
4210         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
4211         if (manager->fdlock == NULL) {
4212                 result = ISC_R_NOMEMORY;
4213                 goto cleanup_lock;
4214         }
4215         for (i = 0; i < FDLOCK_COUNT; i++) {
4216                 result = isc_mutex_init(&manager->fdlock[i]);
4217                 if (result != ISC_R_SUCCESS) {
4218                         while (--i >= 0)
4219                                 DESTROYLOCK(&manager->fdlock[i]);
4220                         isc_mem_put(mctx, manager->fdlock,
4221                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
4222                         manager->fdlock = NULL;
4223                         goto cleanup_lock;
4224                 }
4225         }
4226
4227 #ifdef USE_WATCHER_THREAD
4228         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
4229                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4230                                  "isc_condition_init() %s",
4231                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4232                                                 ISC_MSG_FAILED, "failed"));
4233                 result = ISC_R_UNEXPECTED;
4234                 goto cleanup_lock;
4235         }
4236
4237         /*
4238          * Create the special fds that will be used to wake up the
4239          * select/poll loop when something internal needs to be done.
4240          */
4241         if (pipe(manager->pipe_fds) != 0) {
4242                 isc__strerror(errno, strbuf, sizeof(strbuf));
4243                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4244                                  "pipe() %s: %s",
4245                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4246                                                 ISC_MSG_FAILED, "failed"),
4247                                  strbuf);
4248                 result = ISC_R_UNEXPECTED;
4249                 goto cleanup_condition;
4250         }
4251
4252         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
4253 #if 0
4254         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
4255 #endif
4256 #endif  /* USE_WATCHER_THREAD */
4257
4258 #ifdef USE_SHARED_MANAGER
4259         manager->refs = 1;
4260 #endif /* USE_SHARED_MANAGER */
4261
4262         /*
4263          * Set up initial state for the select loop
4264          */
4265         result = setup_watcher(mctx, manager);
4266         if (result != ISC_R_SUCCESS)
4267                 goto cleanup;
4268         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
4269 #ifdef USE_WATCHER_THREAD
4270         /*
4271          * Start up the select/poll thread.
4272          */
4273         if (isc_thread_create(watcher, manager, &manager->watcher) !=
4274             ISC_R_SUCCESS) {
4275                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4276                                  "isc_thread_create() %s",
4277                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4278                                                 ISC_MSG_FAILED, "failed"));
4279                 cleanup_watcher(mctx, manager);
4280                 result = ISC_R_UNEXPECTED;
4281                 goto cleanup;
4282         }
4283 #endif /* USE_WATCHER_THREAD */
4284         isc_mem_attach(mctx, &manager->mctx);
4285
4286 #ifdef USE_SHARED_MANAGER
4287         socketmgr = manager;
4288 #endif /* USE_SHARED_MANAGER */
4289         *managerp = (isc_socketmgr_t *)manager;
4290
4291         return (ISC_R_SUCCESS);
4292
4293 cleanup:
4294 #ifdef USE_WATCHER_THREAD
4295         (void)close(manager->pipe_fds[0]);
4296         (void)close(manager->pipe_fds[1]);
4297 #endif  /* USE_WATCHER_THREAD */
4298
4299 #ifdef USE_WATCHER_THREAD
4300 cleanup_condition:
4301         (void)isc_condition_destroy(&manager->shutdown_ok);
4302 #endif  /* USE_WATCHER_THREAD */
4303
4304
4305 cleanup_lock:
4306         if (manager->fdlock != NULL) {
4307                 for (i = 0; i < FDLOCK_COUNT; i++)
4308                         DESTROYLOCK(&manager->fdlock[i]);
4309         }
4310         DESTROYLOCK(&manager->lock);
4311
4312 free_manager:
4313         if (manager->fdlock != NULL) {
4314                 isc_mem_put(mctx, manager->fdlock,
4315                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4316         }
4317         if (manager->fdstate != NULL) {
4318                 isc_mem_put(mctx, manager->fdstate,
4319                             manager->maxsocks * sizeof(int));
4320         }
4321         if (manager->fds != NULL) {
4322                 isc_mem_put(mctx, manager->fds,
4323                             manager->maxsocks * sizeof(isc_socket_t *));
4324         }
4325         isc_mem_put(mctx, manager, sizeof(*manager));
4326
4327         return (result);
4328 }
4329
4330 #ifdef BIND9
4331 isc_result_t
4332 isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
4333         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4334         REQUIRE(VALID_MANAGER(manager));
4335         REQUIRE(nsockp != NULL);
4336
4337         *nsockp = manager->maxsocks;
4338
4339         return (ISC_R_SUCCESS);
4340 }
4341
4342 void
4343 isc__socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
4344         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4345
4346         REQUIRE(VALID_MANAGER(manager));
4347         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4348         REQUIRE(manager->stats == NULL);
4349         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4350
4351         isc_stats_attach(stats, &manager->stats);
4352 }
4353 #endif
4354
4355 ISC_SOCKETFUNC_SCOPE void
4356 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
4357         isc__socketmgr_t *manager;
4358         int i;
4359         isc_mem_t *mctx;
4360
4361         /*
4362          * Destroy a socket manager.
4363          */
4364
4365         REQUIRE(managerp != NULL);
4366         manager = (isc__socketmgr_t *)*managerp;
4367         REQUIRE(VALID_MANAGER(manager));
4368
4369 #ifdef USE_SHARED_MANAGER
4370         manager->refs--;
4371         if (manager->refs > 0) {
4372                 *managerp = NULL;
4373                 return;
4374         }
4375         socketmgr = NULL;
4376 #endif /* USE_SHARED_MANAGER */
4377
4378         LOCK(&manager->lock);
4379
4380         /*
4381          * Wait for all sockets to be destroyed.
4382          */
4383         while (!ISC_LIST_EMPTY(manager->socklist)) {
4384 #ifdef USE_WATCHER_THREAD
4385                 manager_log(manager, CREATION, "%s",
4386                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4387                                            ISC_MSG_SOCKETSREMAIN,
4388                                            "sockets exist"));
4389                 WAIT(&manager->shutdown_ok, &manager->lock);
4390 #else /* USE_WATCHER_THREAD */
4391                 UNLOCK(&manager->lock);
4392                 isc__taskmgr_dispatch(NULL);
4393                 LOCK(&manager->lock);
4394 #endif /* USE_WATCHER_THREAD */
4395         }
4396
4397         UNLOCK(&manager->lock);
4398
4399         /*
4400          * Here, poke our select/poll thread.  Do this by closing the write
4401          * half of the pipe, which will send EOF to the read half.
4402          * This is currently a no-op in the non-threaded case.
4403          */
4404         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4405
4406 #ifdef USE_WATCHER_THREAD
4407         /*
4408          * Wait for thread to exit.
4409          */
4410         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4411                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4412                                  "isc_thread_join() %s",
4413                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4414                                                 ISC_MSG_FAILED, "failed"));
4415 #endif /* USE_WATCHER_THREAD */
4416
4417         /*
4418          * Clean up.
4419          */
4420         cleanup_watcher(manager->mctx, manager);
4421
4422 #ifdef USE_WATCHER_THREAD
4423         (void)close(manager->pipe_fds[0]);
4424         (void)close(manager->pipe_fds[1]);
4425         (void)isc_condition_destroy(&manager->shutdown_ok);
4426 #endif /* USE_WATCHER_THREAD */
4427
4428         for (i = 0; i < (int)manager->maxsocks; i++)
4429                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4430                         (void)close(i);
4431
4432         isc_mem_put(manager->mctx, manager->fds,
4433                     manager->maxsocks * sizeof(isc__socket_t *));
4434         isc_mem_put(manager->mctx, manager->fdstate,
4435                     manager->maxsocks * sizeof(int));
4436
4437         if (manager->stats != NULL)
4438                 isc_stats_detach(&manager->stats);
4439
4440         if (manager->fdlock != NULL) {
4441                 for (i = 0; i < FDLOCK_COUNT; i++)
4442                         DESTROYLOCK(&manager->fdlock[i]);
4443                 isc_mem_put(manager->mctx, manager->fdlock,
4444                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4445         }
4446         DESTROYLOCK(&manager->lock);
4447         manager->common.magic = 0;
4448         manager->common.impmagic = 0;
4449         mctx= manager->mctx;
4450         isc_mem_put(mctx, manager, sizeof(*manager));
4451
4452         isc_mem_detach(&mctx);
4453
4454         *managerp = NULL;
4455
4456 #ifdef USE_SHARED_MANAGER
4457         socketmgr = NULL;
4458 #endif
4459 }
4460
4461 static isc_result_t
4462 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4463             unsigned int flags)
4464 {
4465         int io_state;
4466         isc_boolean_t have_lock = ISC_FALSE;
4467         isc_task_t *ntask = NULL;
4468         isc_result_t result = ISC_R_SUCCESS;
4469
4470         dev->ev_sender = task;
4471
4472         if (sock->type == isc_sockettype_udp) {
4473                 io_state = doio_recv(sock, dev);
4474         } else {
4475                 LOCK(&sock->lock);
4476                 have_lock = ISC_TRUE;
4477
4478                 if (ISC_LIST_EMPTY(sock->recv_list))
4479                         io_state = doio_recv(sock, dev);
4480                 else
4481                         io_state = DOIO_SOFT;
4482         }
4483
4484         switch (io_state) {
4485         case DOIO_SOFT:
4486                 /*
4487                  * We couldn't read all or part of the request right now, so
4488                  * queue it.
4489                  *
4490                  * Attach to socket and to task
4491                  */
4492                 isc_task_attach(task, &ntask);
4493                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4494
4495                 if (!have_lock) {
4496                         LOCK(&sock->lock);
4497                         have_lock = ISC_TRUE;
4498                 }
4499
4500                 /*
4501                  * Enqueue the request.  If the socket was previously not being
4502                  * watched, poke the watcher to start paying attention to it.
4503                  */
4504                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4505                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4506                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4507
4508                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4509                            "socket_recv: event %p -> task %p",
4510                            dev, ntask);
4511
4512                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4513                         result = ISC_R_INPROGRESS;
4514                 break;
4515
4516         case DOIO_EOF:
4517                 dev->result = ISC_R_EOF;
4518                 /* fallthrough */
4519
4520         case DOIO_HARD:
4521         case DOIO_SUCCESS:
4522                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4523                         send_recvdone_event(sock, &dev);
4524                 break;
4525         }
4526
4527         if (have_lock)
4528                 UNLOCK(&sock->lock);
4529
4530         return (result);
4531 }
4532
4533 ISC_SOCKETFUNC_SCOPE isc_result_t
4534 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4535                   unsigned int minimum, isc_task_t *task,
4536                   isc_taskaction_t action, const void *arg)
4537 {
4538         isc__socket_t *sock = (isc__socket_t *)sock0;
4539         isc_socketevent_t *dev;
4540         isc__socketmgr_t *manager;
4541         unsigned int iocount;
4542         isc_buffer_t *buffer;
4543
4544         REQUIRE(VALID_SOCKET(sock));
4545         REQUIRE(buflist != NULL);
4546         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4547         REQUIRE(task != NULL);
4548         REQUIRE(action != NULL);
4549
4550         manager = sock->manager;
4551         REQUIRE(VALID_MANAGER(manager));
4552
4553         iocount = isc_bufferlist_availablecount(buflist);
4554         REQUIRE(iocount > 0);
4555
4556         INSIST(sock->bound);
4557
4558         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4559         if (dev == NULL)
4560                 return (ISC_R_NOMEMORY);
4561
4562         /*
4563          * UDP sockets are always partial read
4564          */
4565         if (sock->type == isc_sockettype_udp)
4566                 dev->minimum = 1;
4567         else {
4568                 if (minimum == 0)
4569                         dev->minimum = iocount;
4570                 else
4571                         dev->minimum = minimum;
4572         }
4573
4574         /*
4575          * Move each buffer from the passed in list to our internal one.
4576          */
4577         buffer = ISC_LIST_HEAD(*buflist);
4578         while (buffer != NULL) {
4579                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4580                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4581                 buffer = ISC_LIST_HEAD(*buflist);
4582         }
4583
4584         return (socket_recv(sock, dev, task, 0));
4585 }
4586
4587 ISC_SOCKETFUNC_SCOPE isc_result_t
4588 isc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
4589                  unsigned int minimum, isc_task_t *task,
4590                  isc_taskaction_t action, const void *arg)
4591 {
4592         isc__socket_t *sock = (isc__socket_t *)sock0;
4593         isc_socketevent_t *dev;
4594         isc__socketmgr_t *manager;
4595
4596         REQUIRE(VALID_SOCKET(sock));
4597         REQUIRE(action != NULL);
4598
4599         manager = sock->manager;
4600         REQUIRE(VALID_MANAGER(manager));
4601
4602         INSIST(sock->bound);
4603
4604         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4605         if (dev == NULL)
4606                 return (ISC_R_NOMEMORY);
4607
4608         return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
4609 }
4610
4611 ISC_SOCKETFUNC_SCOPE isc_result_t
4612 isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4613                   unsigned int minimum, isc_task_t *task,
4614                   isc_socketevent_t *event, unsigned int flags)
4615 {
4616         isc__socket_t *sock = (isc__socket_t *)sock0;
4617
4618         event->ev_sender = sock;
4619         event->result = ISC_R_UNSET;
4620         ISC_LIST_INIT(event->bufferlist);
4621         event->region = *region;
4622         event->n = 0;
4623         event->offset = 0;
4624         event->attributes = 0;
4625
4626         /*
4627          * UDP sockets are always partial read.
4628          */
4629         if (sock->type == isc_sockettype_udp)
4630                 event->minimum = 1;
4631         else {
4632                 if (minimum == 0)
4633                         event->minimum = region->length;
4634                 else
4635                         event->minimum = minimum;
4636         }
4637
4638         return (socket_recv(sock, event, task, flags));
4639 }
4640
4641 static isc_result_t
4642 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4643             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4644             unsigned int flags)
4645 {
4646         int io_state;
4647         isc_boolean_t have_lock = ISC_FALSE;
4648         isc_task_t *ntask = NULL;
4649         isc_result_t result = ISC_R_SUCCESS;
4650
4651         dev->ev_sender = task;
4652
4653         set_dev_address(address, sock, dev);
4654         if (pktinfo != NULL) {
4655                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4656                 dev->pktinfo = *pktinfo;
4657
4658                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4659                     !isc_sockaddr_islinklocal(&dev->address)) {
4660                         socket_log(sock, NULL, TRACE, isc_msgcat,
4661                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4662                                    "pktinfo structure provided, ifindex %u "
4663                                    "(set to 0)", pktinfo->ipi6_ifindex);
4664
4665                         /*
4666                          * Set the pktinfo index to 0 here, to let the
4667                          * kernel decide what interface it should send on.
4668                          */
4669                         dev->pktinfo.ipi6_ifindex = 0;
4670                 }
4671         }
4672
4673         if (sock->type == isc_sockettype_udp)
4674                 io_state = doio_send(sock, dev);
4675         else {
4676                 LOCK(&sock->lock);
4677                 have_lock = ISC_TRUE;
4678
4679                 if (ISC_LIST_EMPTY(sock->send_list))
4680                         io_state = doio_send(sock, dev);
4681                 else
4682                         io_state = DOIO_SOFT;
4683         }
4684
4685         switch (io_state) {
4686         case DOIO_SOFT:
4687                 /*
4688                  * We couldn't send all or part of the request right now, so
4689                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4690                  */
4691                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4692                         isc_task_attach(task, &ntask);
4693                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4694
4695                         if (!have_lock) {
4696                                 LOCK(&sock->lock);
4697                                 have_lock = ISC_TRUE;
4698                         }
4699
4700                         /*
4701                          * Enqueue the request.  If the socket was previously
4702                          * not being watched, poke the watcher to start
4703                          * paying attention to it.
4704                          */
4705                         if (ISC_LIST_EMPTY(sock->send_list) &&
4706                             !sock->pending_send)
4707                                 select_poke(sock->manager, sock->fd,
4708                                             SELECT_POKE_WRITE);
4709                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4710
4711                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4712                                    "socket_send: event %p -> task %p",
4713                                    dev, ntask);
4714
4715                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4716                                 result = ISC_R_INPROGRESS;
4717                         break;
4718                 }
4719
4720         case DOIO_HARD:
4721         case DOIO_SUCCESS:
4722                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4723                         send_senddone_event(sock, &dev);
4724                 break;
4725         }
4726
4727         if (have_lock)
4728                 UNLOCK(&sock->lock);
4729
4730         return (result);
4731 }
4732
4733 ISC_SOCKETFUNC_SCOPE isc_result_t
4734 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
4735                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4736 {
4737         /*
4738          * REQUIRE() checking is performed in isc_socket_sendto().
4739          */
4740         return (isc__socket_sendto(sock, region, task, action, arg, NULL,
4741                                    NULL));
4742 }
4743
4744 ISC_SOCKETFUNC_SCOPE isc_result_t
4745 isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
4746                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4747                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4748 {
4749         isc__socket_t *sock = (isc__socket_t *)sock0;
4750         isc_socketevent_t *dev;
4751         isc__socketmgr_t *manager;
4752
4753         REQUIRE(VALID_SOCKET(sock));
4754         REQUIRE(region != NULL);
4755         REQUIRE(task != NULL);
4756         REQUIRE(action != NULL);
4757
4758         manager = sock->manager;
4759         REQUIRE(VALID_MANAGER(manager));
4760
4761         INSIST(sock->bound);
4762
4763         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4764         if (dev == NULL)
4765                 return (ISC_R_NOMEMORY);
4766
4767         dev->region = *region;
4768
4769         return (socket_send(sock, dev, task, address, pktinfo, 0));
4770 }
4771
4772 ISC_SOCKETFUNC_SCOPE isc_result_t
4773 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4774                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4775 {
4776         return (isc__socket_sendtov(sock, buflist, task, action, arg, NULL,
4777                                     NULL));
4778 }
4779
4780 ISC_SOCKETFUNC_SCOPE isc_result_t
4781 isc__socket_sendtov(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4782                     isc_task_t *task, isc_taskaction_t action, const void *arg,
4783                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4784 {
4785         isc__socket_t *sock = (isc__socket_t *)sock0;
4786         isc_socketevent_t *dev;
4787         isc__socketmgr_t *manager;
4788         unsigned int iocount;
4789         isc_buffer_t *buffer;
4790
4791         REQUIRE(VALID_SOCKET(sock));
4792         REQUIRE(buflist != NULL);
4793         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4794         REQUIRE(task != NULL);
4795         REQUIRE(action != NULL);
4796
4797         manager = sock->manager;
4798         REQUIRE(VALID_MANAGER(manager));
4799
4800         iocount = isc_bufferlist_usedcount(buflist);
4801         REQUIRE(iocount > 0);
4802
4803         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4804         if (dev == NULL)
4805                 return (ISC_R_NOMEMORY);
4806
4807         /*
4808          * Move each buffer from the passed in list to our internal one.
4809          */
4810         buffer = ISC_LIST_HEAD(*buflist);
4811         while (buffer != NULL) {
4812                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4813                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4814                 buffer = ISC_LIST_HEAD(*buflist);
4815         }
4816
4817         return (socket_send(sock, dev, task, address, pktinfo, 0));
4818 }
4819
4820 ISC_SOCKETFUNC_SCOPE isc_result_t
4821 isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region,
4822                     isc_task_t *task,
4823                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4824                     isc_socketevent_t *event, unsigned int flags)
4825 {
4826         isc__socket_t *sock = (isc__socket_t *)sock0;
4827
4828         REQUIRE(VALID_SOCKET(sock));
4829         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4830         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4831                 REQUIRE(sock->type == isc_sockettype_udp);
4832         event->ev_sender = sock;
4833         event->result = ISC_R_UNSET;
4834         ISC_LIST_INIT(event->bufferlist);
4835         event->region = *region;
4836         event->n = 0;
4837         event->offset = 0;
4838         event->attributes = 0;
4839
4840         return (socket_send(sock, event, task, address, pktinfo, flags));
4841 }
4842
4843 ISC_SOCKETFUNC_SCOPE void
4844 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4845 #ifdef ISC_PLATFORM_HAVESYSUNH
4846         int s;
4847         struct stat sb;
4848         char strbuf[ISC_STRERRORSIZE];
4849
4850         if (sockaddr->type.sa.sa_family != AF_UNIX)
4851                 return;
4852
4853 #ifndef S_ISSOCK
4854 #if defined(S_IFMT) && defined(S_IFSOCK)
4855 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4856 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4857 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4858 #endif
4859 #endif
4860
4861 #ifndef S_ISFIFO
4862 #if defined(S_IFMT) && defined(S_IFIFO)
4863 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4864 #elif defined(_S_IFMT) && defined(S_IFIFO)
4865 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4866 #endif
4867 #endif
4868
4869 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4870 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4871 #endif
4872
4873 #ifndef S_ISFIFO
4874 #define S_ISFIFO(mode) 0
4875 #endif
4876
4877 #ifndef S_ISSOCK
4878 #define S_ISSOCK(mode) 0
4879 #endif
4880
4881         if (active) {
4882                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4883                         isc__strerror(errno, strbuf, sizeof(strbuf));
4884                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4885                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4886                                       "isc_socket_cleanunix: stat(%s): %s",
4887                                       sockaddr->type.sunix.sun_path, strbuf);
4888                         return;
4889                 }
4890                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4891                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4892                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4893                                       "isc_socket_cleanunix: %s: not a socket",
4894                                       sockaddr->type.sunix.sun_path);
4895                         return;
4896                 }
4897                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4898                         isc__strerror(errno, strbuf, sizeof(strbuf));
4899                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4900                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4901                                       "isc_socket_cleanunix: unlink(%s): %s",
4902                                       sockaddr->type.sunix.sun_path, strbuf);
4903                 }
4904                 return;
4905         }
4906
4907         s = socket(AF_UNIX, SOCK_STREAM, 0);
4908         if (s < 0) {
4909                 isc__strerror(errno, strbuf, sizeof(strbuf));
4910                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4911                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4912                               "isc_socket_cleanunix: socket(%s): %s",
4913                               sockaddr->type.sunix.sun_path, strbuf);
4914                 return;
4915         }
4916
4917         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4918                 switch (errno) {
4919                 case ENOENT:    /* We exited cleanly last time */
4920                         break;
4921                 default:
4922                         isc__strerror(errno, strbuf, sizeof(strbuf));
4923                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4924                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4925                                       "isc_socket_cleanunix: stat(%s): %s",
4926                                       sockaddr->type.sunix.sun_path, strbuf);
4927                         break;
4928                 }
4929                 goto cleanup;
4930         }
4931
4932         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4933                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4934                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4935                               "isc_socket_cleanunix: %s: not a socket",
4936                               sockaddr->type.sunix.sun_path);
4937                 goto cleanup;
4938         }
4939
4940         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4941                     sizeof(sockaddr->type.sunix)) < 0) {
4942                 switch (errno) {
4943                 case ECONNREFUSED:
4944                 case ECONNRESET:
4945                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4946                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4947                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4948                                               ISC_LOGMODULE_SOCKET,
4949                                               ISC_LOG_WARNING,
4950                                               "isc_socket_cleanunix: "
4951                                               "unlink(%s): %s",
4952                                               sockaddr->type.sunix.sun_path,
4953                                               strbuf);
4954                         }
4955                         break;
4956                 default:
4957                         isc__strerror(errno, strbuf, sizeof(strbuf));
4958                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4959                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4960                                       "isc_socket_cleanunix: connect(%s): %s",
4961                                       sockaddr->type.sunix.sun_path, strbuf);
4962                         break;
4963                 }
4964         }
4965  cleanup:
4966         close(s);
4967 #else
4968         UNUSED(sockaddr);
4969         UNUSED(active);
4970 #endif
4971 }
4972
4973 ISC_SOCKETFUNC_SCOPE isc_result_t
4974 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4975                     isc_uint32_t owner, isc_uint32_t group)
4976 {
4977 #ifdef ISC_PLATFORM_HAVESYSUNH
4978         isc_result_t result = ISC_R_SUCCESS;
4979         char strbuf[ISC_STRERRORSIZE];
4980         char path[sizeof(sockaddr->type.sunix.sun_path)];
4981 #ifdef NEED_SECURE_DIRECTORY
4982         char *slash;
4983 #endif
4984
4985         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4986         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4987         strcpy(path, sockaddr->type.sunix.sun_path);
4988
4989 #ifdef NEED_SECURE_DIRECTORY
4990         slash = strrchr(path, '/');
4991         if (slash != NULL) {
4992                 if (slash != path)
4993                         *slash = '\0';
4994                 else
4995                         strcpy(path, "/");
4996         } else
4997                 strcpy(path, ".");
4998 #endif
4999
5000         if (chmod(path, perm) < 0) {
5001                 isc__strerror(errno, strbuf, sizeof(strbuf));
5002                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
5003                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
5004                               "isc_socket_permunix: chmod(%s, %d): %s",
5005                               path, perm, strbuf);
5006                 result = ISC_R_FAILURE;
5007         }
5008         if (chown(path, owner, group) < 0) {
5009                 isc__strerror(errno, strbuf, sizeof(strbuf));
5010                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
5011                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
5012                               "isc_socket_permunix: chown(%s, %d, %d): %s",
5013                               path, owner, group,
5014                               strbuf);
5015                 result = ISC_R_FAILURE;
5016         }
5017         return (result);
5018 #else
5019         UNUSED(sockaddr);
5020         UNUSED(perm);
5021         UNUSED(owner);
5022         UNUSED(group);
5023         return (ISC_R_NOTIMPLEMENTED);
5024 #endif
5025 }
5026
5027 ISC_SOCKETFUNC_SCOPE isc_result_t
5028 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
5029                  unsigned int options) {
5030         isc__socket_t *sock = (isc__socket_t *)sock0;
5031         char strbuf[ISC_STRERRORSIZE];
5032         int on = 1;
5033
5034         REQUIRE(VALID_SOCKET(sock));
5035
5036         LOCK(&sock->lock);
5037
5038         INSIST(!sock->bound);
5039         INSIST(!sock->dupped);
5040
5041         if (sock->pf != sockaddr->type.sa.sa_family) {
5042                 UNLOCK(&sock->lock);
5043                 return (ISC_R_FAMILYMISMATCH);
5044         }
5045
5046         /*
5047          * Only set SO_REUSEADDR when we want a specific port.
5048          */
5049 #ifdef AF_UNIX
5050         if (sock->pf == AF_UNIX)
5051                 goto bind_socket;
5052 #endif
5053         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
5054             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
5055             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
5056                        sizeof(on)) < 0) {
5057                 UNEXPECTED_ERROR(__FILE__, __LINE__,
5058                                  "setsockopt(%d) %s", sock->fd,
5059                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
5060                                                 ISC_MSG_FAILED, "failed"));
5061                 /* Press on... */
5062         }
5063 #ifdef AF_UNIX
5064  bind_socket:
5065 #endif
5066         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
5067                 inc_stats(sock->manager->stats,
5068                           sock->statsindex[STATID_BINDFAIL]);
5069
5070                 UNLOCK(&sock->lock);
5071                 switch (errno) {
5072                 case EACCES:
5073                         return (ISC_R_NOPERM);
5074                 case EADDRNOTAVAIL:
5075                         return (ISC_R_ADDRNOTAVAIL);
5076                 case EADDRINUSE:
5077                         return (ISC_R_ADDRINUSE);
5078                 case EINVAL:
5079                         return (ISC_R_BOUND);
5080                 default:
5081                         isc__strerror(errno, strbuf, sizeof(strbuf));
5082                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
5083                                          strbuf);
5084                         return (ISC_R_UNEXPECTED);
5085                 }
5086         }
5087
5088         socket_log(sock, sockaddr, TRACE,
5089                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
5090         sock->bound = 1;
5091
5092         UNLOCK(&sock->lock);
5093         return (ISC_R_SUCCESS);
5094 }
5095
5096 /*
5097  * Enable this only for specific OS versions, and only when they have repaired
5098  * their problems with it.  Until then, this is broken and needs to be
5099  * disabled by default.  See RT22589 for details.
5100  */
5101 #undef ENABLE_ACCEPTFILTER
5102
5103 ISC_SOCKETFUNC_SCOPE isc_result_t
5104 isc__socket_filter(isc_socket_t *sock0, const char *filter) {
5105         isc__socket_t *sock = (isc__socket_t *)sock0;
5106 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5107         char strbuf[ISC_STRERRORSIZE];
5108         struct accept_filter_arg afa;
5109 #else
5110         UNUSED(sock);
5111         UNUSED(filter);
5112 #endif
5113
5114         REQUIRE(VALID_SOCKET(sock));
5115
5116 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5117         bzero(&afa, sizeof(afa));
5118         strncpy(afa.af_name, filter, sizeof(afa.af_name));
5119         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
5120                          &afa, sizeof(afa)) == -1) {
5121                 isc__strerror(errno, strbuf, sizeof(strbuf));
5122                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
5123                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
5124                            strbuf);
5125                 return (ISC_R_FAILURE);
5126         }
5127         return (ISC_R_SUCCESS);
5128 #else
5129         return (ISC_R_NOTIMPLEMENTED);
5130 #endif
5131 }
5132
5133 /*
5134  * Set up to listen on a given socket.  We do this by creating an internal
5135  * event that will be dispatched when the socket has read activity.  The
5136  * watcher will send the internal event to the task when there is a new
5137  * connection.
5138  *
5139  * Unlike in read, we don't preallocate a done event here.  Every time there
5140  * is a new connection we'll have to allocate a new one anyway, so we might
5141  * as well keep things simple rather than having to track them.
5142  */
5143 ISC_SOCKETFUNC_SCOPE isc_result_t
5144 isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
5145         isc__socket_t *sock = (isc__socket_t *)sock0;
5146         char strbuf[ISC_STRERRORSIZE];
5147
5148         REQUIRE(VALID_SOCKET(sock));
5149
5150         LOCK(&sock->lock);
5151
5152         REQUIRE(!sock->listener);
5153         REQUIRE(sock->bound);
5154         REQUIRE(sock->type == isc_sockettype_tcp ||
5155                 sock->type == isc_sockettype_unix);
5156
5157         if (backlog == 0)
5158                 backlog = SOMAXCONN;
5159
5160         if (listen(sock->fd, (int)backlog) < 0) {
5161                 UNLOCK(&sock->lock);
5162                 isc__strerror(errno, strbuf, sizeof(strbuf));
5163
5164                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
5165
5166                 return (ISC_R_UNEXPECTED);
5167         }
5168
5169         sock->listener = 1;
5170
5171         UNLOCK(&sock->lock);
5172         return (ISC_R_SUCCESS);
5173 }
5174
5175 /*
5176  * This should try to do aggressive accept() XXXMLG
5177  */
5178 ISC_SOCKETFUNC_SCOPE isc_result_t
5179 isc__socket_accept(isc_socket_t *sock0,
5180                   isc_task_t *task, isc_taskaction_t action, const void *arg)
5181 {
5182         isc__socket_t *sock = (isc__socket_t *)sock0;
5183         isc_socket_newconnev_t *dev;
5184         isc__socketmgr_t *manager;
5185         isc_task_t *ntask = NULL;
5186         isc__socket_t *nsock;
5187         isc_result_t result;
5188         isc_boolean_t do_poke = ISC_FALSE;
5189
5190         REQUIRE(VALID_SOCKET(sock));
5191         manager = sock->manager;
5192         REQUIRE(VALID_MANAGER(manager));
5193
5194         LOCK(&sock->lock);
5195
5196         REQUIRE(sock->listener);
5197
5198         /*
5199          * Sender field is overloaded here with the task we will be sending
5200          * this event to.  Just before the actual event is delivered the
5201          * actual ev_sender will be touched up to be the socket.
5202          */
5203         dev = (isc_socket_newconnev_t *)
5204                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
5205                                    action, arg, sizeof(*dev));
5206         if (dev == NULL) {
5207                 UNLOCK(&sock->lock);
5208                 return (ISC_R_NOMEMORY);
5209         }
5210         ISC_LINK_INIT(dev, ev_link);
5211
5212         result = allocate_socket(manager, sock->type, &nsock);
5213         if (result != ISC_R_SUCCESS) {
5214                 isc_event_free(ISC_EVENT_PTR(&dev));
5215                 UNLOCK(&sock->lock);
5216                 return (result);
5217         }
5218
5219         /*
5220          * Attach to socket and to task.
5221          */
5222         isc_task_attach(task, &ntask);
5223         if (isc_task_exiting(ntask)) {
5224                 free_socket(&nsock);
5225                 isc_task_detach(&ntask);
5226                 isc_event_free(ISC_EVENT_PTR(&dev));
5227                 UNLOCK(&sock->lock);
5228                 return (ISC_R_SHUTTINGDOWN);
5229         }
5230         nsock->references++;
5231         nsock->statsindex = sock->statsindex;
5232
5233         dev->ev_sender = ntask;
5234         dev->newsocket = (isc_socket_t *)nsock;
5235
5236         /*
5237          * Poke watcher here.  We still have the socket locked, so there
5238          * is no race condition.  We will keep the lock for such a short
5239          * bit of time waking it up now or later won't matter all that much.
5240          */
5241         if (ISC_LIST_EMPTY(sock->accept_list))
5242                 do_poke = ISC_TRUE;
5243
5244         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
5245
5246         if (do_poke)
5247                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
5248
5249         UNLOCK(&sock->lock);
5250         return (ISC_R_SUCCESS);
5251 }
5252
5253 ISC_SOCKETFUNC_SCOPE isc_result_t
5254 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
5255                    isc_task_t *task, isc_taskaction_t action, const void *arg)
5256 {
5257         isc__socket_t *sock = (isc__socket_t *)sock0;
5258         isc_socket_connev_t *dev;
5259         isc_task_t *ntask = NULL;
5260         isc__socketmgr_t *manager;
5261         int cc;
5262         char strbuf[ISC_STRERRORSIZE];
5263         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
5264
5265         REQUIRE(VALID_SOCKET(sock));
5266         REQUIRE(addr != NULL);
5267         REQUIRE(task != NULL);
5268         REQUIRE(action != NULL);
5269
5270         manager = sock->manager;
5271         REQUIRE(VALID_MANAGER(manager));
5272         REQUIRE(addr != NULL);
5273
5274         if (isc_sockaddr_ismulticast(addr))
5275                 return (ISC_R_MULTICAST);
5276
5277         LOCK(&sock->lock);
5278
5279         REQUIRE(!sock->connecting);
5280
5281         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
5282                                                         ISC_SOCKEVENT_CONNECT,
5283                                                         action, arg,
5284                                                         sizeof(*dev));
5285         if (dev == NULL) {
5286                 UNLOCK(&sock->lock);
5287                 return (ISC_R_NOMEMORY);
5288         }
5289         ISC_LINK_INIT(dev, ev_link);
5290
5291         /*
5292          * Try to do the connect right away, as there can be only one
5293          * outstanding, and it might happen to complete.
5294          */
5295         sock->peer_address = *addr;
5296         cc = connect(sock->fd, &addr->type.sa, addr->length);
5297         if (cc < 0) {
5298                 /*
5299                  * HP-UX "fails" to connect a UDP socket and sets errno to
5300                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
5301                  * a success and let the user detect it if it's really an error
5302                  * at the time of sending a packet on the socket.
5303                  */
5304                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
5305                         cc = 0;
5306                         goto success;
5307                 }
5308                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
5309                         goto queue;
5310
5311                 switch (errno) {
5312 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
5313                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5314                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5315                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5316                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5317                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5318 #ifdef EHOSTDOWN
5319                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5320 #endif
5321                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5322                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5323                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5324                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5325                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5326 #undef ERROR_MATCH
5327                 }
5328
5329                 sock->connected = 0;
5330
5331                 isc__strerror(errno, strbuf, sizeof(strbuf));
5332                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
5333                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
5334                                  addrbuf, errno, strbuf);
5335
5336                 UNLOCK(&sock->lock);
5337                 inc_stats(sock->manager->stats,
5338                           sock->statsindex[STATID_CONNECTFAIL]);
5339                 isc_event_free(ISC_EVENT_PTR(&dev));
5340                 return (ISC_R_UNEXPECTED);
5341
5342         err_exit:
5343                 sock->connected = 0;
5344                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5345
5346                 UNLOCK(&sock->lock);
5347                 inc_stats(sock->manager->stats,
5348                           sock->statsindex[STATID_CONNECTFAIL]);
5349                 return (ISC_R_SUCCESS);
5350         }
5351
5352         /*
5353          * If connect completed, fire off the done event.
5354          */
5355  success:
5356         if (cc == 0) {
5357                 sock->connected = 1;
5358                 sock->bound = 1;
5359                 dev->result = ISC_R_SUCCESS;
5360                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5361
5362                 UNLOCK(&sock->lock);
5363
5364                 inc_stats(sock->manager->stats,
5365                           sock->statsindex[STATID_CONNECT]);
5366
5367                 return (ISC_R_SUCCESS);
5368         }
5369
5370  queue:
5371
5372         /*
5373          * Attach to task.
5374          */
5375         isc_task_attach(task, &ntask);
5376
5377         sock->connecting = 1;
5378
5379         dev->ev_sender = ntask;
5380
5381         /*
5382          * Poke watcher here.  We still have the socket locked, so there
5383          * is no race condition.  We will keep the lock for such a short
5384          * bit of time waking it up now or later won't matter all that much.
5385          */
5386         if (sock->connect_ev == NULL)
5387                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
5388
5389         sock->connect_ev = dev;
5390
5391         UNLOCK(&sock->lock);
5392         return (ISC_R_SUCCESS);
5393 }
5394
5395 /*
5396  * Called when a socket with a pending connect() finishes.
5397  */
5398 static void
5399 internal_connect(isc_task_t *me, isc_event_t *ev) {
5400         isc__socket_t *sock;
5401         isc_socket_connev_t *dev;
5402         isc_task_t *task;
5403         int cc;
5404         ISC_SOCKADDR_LEN_T optlen;
5405         char strbuf[ISC_STRERRORSIZE];
5406         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5407
5408         UNUSED(me);
5409         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5410
5411         sock = ev->ev_sender;
5412         INSIST(VALID_SOCKET(sock));
5413
5414         LOCK(&sock->lock);
5415
5416         /*
5417          * When the internal event was sent the reference count was bumped
5418          * to keep the socket around for us.  Decrement the count here.
5419          */
5420         INSIST(sock->references > 0);
5421         sock->references--;
5422         if (sock->references == 0) {
5423                 UNLOCK(&sock->lock);
5424                 destroy(&sock);
5425                 return;
5426         }
5427
5428         /*
5429          * Has this event been canceled?
5430          */
5431         dev = sock->connect_ev;
5432         if (dev == NULL) {
5433                 INSIST(!sock->connecting);
5434                 UNLOCK(&sock->lock);
5435                 return;
5436         }
5437
5438         INSIST(sock->connecting);
5439         sock->connecting = 0;
5440
5441         /*
5442          * Get any possible error status here.
5443          */
5444         optlen = sizeof(cc);
5445         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5446                        (void *)&cc, (void *)&optlen) < 0)
5447                 cc = errno;
5448         else
5449                 errno = cc;
5450
5451         if (errno != 0) {
5452                 /*
5453                  * If the error is EAGAIN, just re-select on this
5454                  * fd and pretend nothing strange happened.
5455                  */
5456                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5457                         sock->connecting = 1;
5458                         select_poke(sock->manager, sock->fd,
5459                                     SELECT_POKE_CONNECT);
5460                         UNLOCK(&sock->lock);
5461
5462                         return;
5463                 }
5464
5465                 inc_stats(sock->manager->stats,
5466                           sock->statsindex[STATID_CONNECTFAIL]);
5467
5468                 /*
5469                  * Translate other errors into ISC_R_* flavors.
5470                  */
5471                 switch (errno) {
5472 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5473                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5474                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5475                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5476                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5477                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5478 #ifdef EHOSTDOWN
5479                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5480 #endif
5481                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5482                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5483                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5484                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5485                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5486                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5487 #undef ERROR_MATCH
5488                 default:
5489                         dev->result = ISC_R_UNEXPECTED;
5490                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5491                                             sizeof(peerbuf));
5492                         isc__strerror(errno, strbuf, sizeof(strbuf));
5493                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5494                                          "internal_connect: connect(%s) %s",
5495                                          peerbuf, strbuf);
5496                 }
5497         } else {
5498                 inc_stats(sock->manager->stats,
5499                           sock->statsindex[STATID_CONNECT]);
5500                 dev->result = ISC_R_SUCCESS;
5501                 sock->connected = 1;
5502                 sock->bound = 1;
5503         }
5504
5505         sock->connect_ev = NULL;
5506
5507         UNLOCK(&sock->lock);
5508
5509         task = dev->ev_sender;
5510         dev->ev_sender = sock;
5511         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5512 }
5513
5514 ISC_SOCKETFUNC_SCOPE isc_result_t
5515 isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5516         isc__socket_t *sock = (isc__socket_t *)sock0;
5517         isc_result_t result;
5518
5519         REQUIRE(VALID_SOCKET(sock));
5520         REQUIRE(addressp != NULL);
5521
5522         LOCK(&sock->lock);
5523
5524         if (sock->connected) {
5525                 *addressp = sock->peer_address;
5526                 result = ISC_R_SUCCESS;
5527         } else {
5528                 result = ISC_R_NOTCONNECTED;
5529         }
5530
5531         UNLOCK(&sock->lock);
5532
5533         return (result);
5534 }
5535
5536 ISC_SOCKETFUNC_SCOPE isc_result_t
5537 isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5538         isc__socket_t *sock = (isc__socket_t *)sock0;
5539         ISC_SOCKADDR_LEN_T len;
5540         isc_result_t result;
5541         char strbuf[ISC_STRERRORSIZE];
5542
5543         REQUIRE(VALID_SOCKET(sock));
5544         REQUIRE(addressp != NULL);
5545
5546         LOCK(&sock->lock);
5547
5548         if (!sock->bound) {
5549                 result = ISC_R_NOTBOUND;
5550                 goto out;
5551         }
5552
5553         result = ISC_R_SUCCESS;
5554
5555         len = sizeof(addressp->type);
5556         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5557                 isc__strerror(errno, strbuf, sizeof(strbuf));
5558                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5559                                  strbuf);
5560                 result = ISC_R_UNEXPECTED;
5561                 goto out;
5562         }
5563         addressp->length = (unsigned int)len;
5564
5565  out:
5566         UNLOCK(&sock->lock);
5567
5568         return (result);
5569 }
5570
5571 /*
5572  * Run through the list of events on this socket, and cancel the ones
5573  * queued for task "task" of type "how".  "how" is a bitmask.
5574  */
5575 ISC_SOCKETFUNC_SCOPE void
5576 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
5577         isc__socket_t *sock = (isc__socket_t *)sock0;
5578
5579         REQUIRE(VALID_SOCKET(sock));
5580
5581         /*
5582          * Quick exit if there is nothing to do.  Don't even bother locking
5583          * in this case.
5584          */
5585         if (how == 0)
5586                 return;
5587
5588         LOCK(&sock->lock);
5589
5590         /*
5591          * All of these do the same thing, more or less.
5592          * Each will:
5593          *      o If the internal event is marked as "posted" try to
5594          *        remove it from the task's queue.  If this fails, mark it
5595          *        as canceled instead, and let the task clean it up later.
5596          *      o For each I/O request for that task of that type, post
5597          *        its done event with status of "ISC_R_CANCELED".
5598          *      o Reset any state needed.
5599          */
5600         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5601             && !ISC_LIST_EMPTY(sock->recv_list)) {
5602                 isc_socketevent_t      *dev;
5603                 isc_socketevent_t      *next;
5604                 isc_task_t             *current_task;
5605
5606                 dev = ISC_LIST_HEAD(sock->recv_list);
5607
5608                 while (dev != NULL) {
5609                         current_task = dev->ev_sender;
5610                         next = ISC_LIST_NEXT(dev, ev_link);
5611
5612                         if ((task == NULL) || (task == current_task)) {
5613                                 dev->result = ISC_R_CANCELED;
5614                                 send_recvdone_event(sock, &dev);
5615                         }
5616                         dev = next;
5617                 }
5618         }
5619
5620         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5621             && !ISC_LIST_EMPTY(sock->send_list)) {
5622                 isc_socketevent_t      *dev;
5623                 isc_socketevent_t      *next;
5624                 isc_task_t             *current_task;
5625
5626                 dev = ISC_LIST_HEAD(sock->send_list);
5627
5628                 while (dev != NULL) {
5629                         current_task = dev->ev_sender;
5630                         next = ISC_LIST_NEXT(dev, ev_link);
5631
5632                         if ((task == NULL) || (task == current_task)) {
5633                                 dev->result = ISC_R_CANCELED;
5634                                 send_senddone_event(sock, &dev);
5635                         }
5636                         dev = next;
5637                 }
5638         }
5639
5640         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5641             && !ISC_LIST_EMPTY(sock->accept_list)) {
5642                 isc_socket_newconnev_t *dev;
5643                 isc_socket_newconnev_t *next;
5644                 isc_task_t             *current_task;
5645
5646                 dev = ISC_LIST_HEAD(sock->accept_list);
5647                 while (dev != NULL) {
5648                         current_task = dev->ev_sender;
5649                         next = ISC_LIST_NEXT(dev, ev_link);
5650
5651                         if ((task == NULL) || (task == current_task)) {
5652
5653                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5654                                                 ev_link);
5655
5656                                 NEWCONNSOCK(dev)->references--;
5657                                 free_socket((isc__socket_t **)&dev->newsocket);
5658
5659                                 dev->result = ISC_R_CANCELED;
5660                                 dev->ev_sender = sock;
5661                                 isc_task_sendanddetach(&current_task,
5662                                                        ISC_EVENT_PTR(&dev));
5663                         }
5664
5665                         dev = next;
5666                 }
5667         }
5668
5669         /*
5670          * Connecting is not a list.
5671          */
5672         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5673             && sock->connect_ev != NULL) {
5674                 isc_socket_connev_t    *dev;
5675                 isc_task_t             *current_task;
5676
5677                 INSIST(sock->connecting);
5678                 sock->connecting = 0;
5679
5680                 dev = sock->connect_ev;
5681                 current_task = dev->ev_sender;
5682
5683                 if ((task == NULL) || (task == current_task)) {
5684                         sock->connect_ev = NULL;
5685
5686                         dev->result = ISC_R_CANCELED;
5687                         dev->ev_sender = sock;
5688                         isc_task_sendanddetach(&current_task,
5689                                                ISC_EVENT_PTR(&dev));
5690                 }
5691         }
5692
5693         UNLOCK(&sock->lock);
5694 }
5695
5696 ISC_SOCKETFUNC_SCOPE isc_sockettype_t
5697 isc__socket_gettype(isc_socket_t *sock0) {
5698         isc__socket_t *sock = (isc__socket_t *)sock0;
5699
5700         REQUIRE(VALID_SOCKET(sock));
5701
5702         return (sock->type);
5703 }
5704
5705 ISC_SOCKETFUNC_SCOPE isc_boolean_t
5706 isc__socket_isbound(isc_socket_t *sock0) {
5707         isc__socket_t *sock = (isc__socket_t *)sock0;
5708         isc_boolean_t val;
5709
5710         REQUIRE(VALID_SOCKET(sock));
5711
5712         LOCK(&sock->lock);
5713         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5714         UNLOCK(&sock->lock);
5715
5716         return (val);
5717 }
5718
5719 ISC_SOCKETFUNC_SCOPE void
5720 isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
5721         isc__socket_t *sock = (isc__socket_t *)sock0;
5722 #if defined(IPV6_V6ONLY)
5723         int onoff = yes ? 1 : 0;
5724 #else
5725         UNUSED(yes);
5726         UNUSED(sock);
5727 #endif
5728
5729         REQUIRE(VALID_SOCKET(sock));
5730         INSIST(!sock->dupped);
5731
5732 #ifdef IPV6_V6ONLY
5733         if (sock->pf == AF_INET6) {
5734                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5735                                (void *)&onoff, sizeof(int)) < 0) {
5736                         char strbuf[ISC_STRERRORSIZE];
5737                         isc__strerror(errno, strbuf, sizeof(strbuf));
5738                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5739                                          "setsockopt(%d, IPV6_V6ONLY) "
5740                                          "%s: %s", sock->fd,
5741                                          isc_msgcat_get(isc_msgcat,
5742                                                         ISC_MSGSET_GENERAL,
5743                                                         ISC_MSG_FAILED,
5744                                                         "failed"),
5745                                          strbuf);
5746                 }
5747         }
5748         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5749 #endif
5750 }
5751
5752 #ifndef USE_WATCHER_THREAD
5753 /*
5754  * In our assumed scenario, we can simply use a single static object.
5755  * XXX: this is not true if the application uses multiple threads with
5756  *      'multi-context' mode.  Fixing this is a future TODO item.
5757  */
5758 static isc_socketwait_t swait_private;
5759
5760 int
5761 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
5762                           isc_socketwait_t **swaitp)
5763 {
5764         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5765
5766
5767         int n;
5768 #ifdef USE_KQUEUE
5769         struct timespec ts, *tsp;
5770 #endif
5771 #ifdef USE_EPOLL
5772         int timeout;
5773 #endif
5774 #ifdef USE_DEVPOLL
5775         struct dvpoll dvp;
5776 #endif
5777
5778         REQUIRE(swaitp != NULL && *swaitp == NULL);
5779
5780 #ifdef USE_SHARED_MANAGER
5781         if (manager == NULL)
5782                 manager = socketmgr;
5783 #endif
5784         if (manager == NULL)
5785                 return (0);
5786
5787 #ifdef USE_KQUEUE
5788         if (tvp != NULL) {
5789                 ts.tv_sec = tvp->tv_sec;
5790                 ts.tv_nsec = tvp->tv_usec * 1000;
5791                 tsp = &ts;
5792         } else
5793                 tsp = NULL;
5794         swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
5795                                        manager->events, manager->nevents,
5796                                        tsp);
5797         n = swait_private.nevents;
5798 #elif defined(USE_EPOLL)
5799         if (tvp != NULL)
5800                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5801         else
5802                 timeout = -1;
5803         swait_private.nevents = epoll_wait(manager->epoll_fd,
5804                                            manager->events,
5805                                            manager->nevents, timeout);
5806         n = swait_private.nevents;
5807 #elif defined(USE_DEVPOLL)
5808         dvp.dp_fds = manager->events;
5809         dvp.dp_nfds = manager->nevents;
5810         if (tvp != NULL) {
5811                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5812                         (tvp->tv_usec + 999) / 1000;
5813         } else
5814                 dvp.dp_timeout = -1;
5815         swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
5816         n = swait_private.nevents;
5817 #elif defined(USE_SELECT)
5818         memcpy(manager->read_fds_copy, manager->read_fds,  manager->fd_bufsize);
5819         memcpy(manager->write_fds_copy, manager->write_fds,
5820                manager->fd_bufsize);
5821
5822         swait_private.readset = manager->read_fds_copy;
5823         swait_private.writeset = manager->write_fds_copy;
5824         swait_private.maxfd = manager->maxfd + 1;
5825
5826         n = select(swait_private.maxfd, swait_private.readset,
5827                    swait_private.writeset, NULL, tvp);
5828 #endif
5829
5830         *swaitp = &swait_private;
5831         return (n);
5832 }
5833
5834 isc_result_t
5835 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
5836         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5837
5838         REQUIRE(swait == &swait_private);
5839
5840 #ifdef USE_SHARED_MANAGER
5841         if (manager == NULL)
5842                 manager = socketmgr;
5843 #endif
5844         if (manager == NULL)
5845                 return (ISC_R_NOTFOUND);
5846
5847 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5848         (void)process_fds(manager, manager->events, swait->nevents);
5849         return (ISC_R_SUCCESS);
5850 #elif defined(USE_SELECT)
5851         process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
5852         return (ISC_R_SUCCESS);
5853 #endif
5854 }
5855 #endif /* USE_WATCHER_THREAD */
5856
5857 #ifdef BIND9
5858 void
5859 isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5860         isc__socket_t *socket = (isc__socket_t *)socket0;
5861
5862         /*
5863          * Name 'socket'.
5864          */
5865
5866         REQUIRE(VALID_SOCKET(socket));
5867
5868         LOCK(&socket->lock);
5869         memset(socket->name, 0, sizeof(socket->name));
5870         strncpy(socket->name, name, sizeof(socket->name) - 1);
5871         socket->tag = tag;
5872         UNLOCK(&socket->lock);
5873 }
5874
5875 ISC_SOCKETFUNC_SCOPE const char *
5876 isc__socket_getname(isc_socket_t *socket0) {
5877         isc__socket_t *socket = (isc__socket_t *)socket0;
5878
5879         return (socket->name);
5880 }
5881
5882 void *
5883 isc__socket_gettag(isc_socket_t *socket0) {
5884         isc__socket_t *socket = (isc__socket_t *)socket0;
5885
5886         return (socket->tag);
5887 }
5888 #endif  /* BIND9 */
5889
5890 #ifdef USE_SOCKETIMPREGISTER
5891 isc_result_t
5892 isc__socket_register() {
5893         return (isc_socket_register(isc__socketmgr_create));
5894 }
5895 #endif
5896
5897 ISC_SOCKETFUNC_SCOPE int
5898 isc__socket_getfd(isc_socket_t *socket0) {
5899         isc__socket_t *socket = (isc__socket_t *)socket0;
5900
5901         return ((short) socket->fd);
5902 }
5903
5904 #if defined(HAVE_LIBXML2) && defined(BIND9)
5905
5906 static const char *
5907 _socktype(isc_sockettype_t type)
5908 {
5909         if (type == isc_sockettype_udp)
5910                 return ("udp");
5911         else if (type == isc_sockettype_tcp)
5912                 return ("tcp");
5913         else if (type == isc_sockettype_unix)
5914                 return ("unix");
5915         else if (type == isc_sockettype_fdwatch)
5916                 return ("fdwatch");
5917         else
5918                 return ("not-initialized");
5919 }
5920
5921 ISC_SOCKETFUNC_SCOPE void
5922 isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
5923         isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
5924         isc__socket_t *sock;
5925         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5926         isc_sockaddr_t addr;
5927         ISC_SOCKADDR_LEN_T len;
5928
5929         LOCK(&mgr->lock);
5930
5931 #ifdef USE_SHARED_MANAGER
5932         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5933         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5934         xmlTextWriterEndElement(writer);
5935 #endif  /* USE_SHARED_MANAGER */
5936
5937         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5938         sock = ISC_LIST_HEAD(mgr->socklist);
5939         while (sock != NULL) {
5940                 LOCK(&sock->lock);
5941                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5942
5943                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5944                 xmlTextWriterWriteFormatString(writer, "%p", sock);
5945                 xmlTextWriterEndElement(writer);
5946
5947                 if (sock->name[0] != 0) {
5948                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5949                         xmlTextWriterWriteFormatString(writer, "%s",
5950                                                        sock->name);
5951                         xmlTextWriterEndElement(writer); /* name */
5952                 }
5953
5954                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5955                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5956                 xmlTextWriterEndElement(writer);
5957
5958                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5959                                           ISC_XMLCHAR _socktype(sock->type));
5960
5961                 if (sock->connected) {
5962                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5963                                             sizeof(peerbuf));
5964                         xmlTextWriterWriteElement(writer,
5965                                                   ISC_XMLCHAR "peer-address",
5966                                                   ISC_XMLCHAR peerbuf);
5967                 }
5968
5969                 len = sizeof(addr);
5970                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5971                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5972                         xmlTextWriterWriteElement(writer,
5973                                                   ISC_XMLCHAR "local-address",
5974                                                   ISC_XMLCHAR peerbuf);
5975                 }
5976
5977                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5978                 if (sock->pending_recv)
5979                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5980                                                 ISC_XMLCHAR "pending-receive");
5981                 if (sock->pending_send)
5982                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5983                                                   ISC_XMLCHAR "pending-send");
5984                 if (sock->pending_accept)
5985                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5986                                                  ISC_XMLCHAR "pending_accept");
5987                 if (sock->listener)
5988                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5989                                                   ISC_XMLCHAR "listener");
5990                 if (sock->connected)
5991                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5992                                                   ISC_XMLCHAR "connected");
5993                 if (sock->connecting)
5994                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5995                                                   ISC_XMLCHAR "connecting");
5996                 if (sock->bound)
5997                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5998                                                   ISC_XMLCHAR "bound");
5999
6000                 xmlTextWriterEndElement(writer); /* states */
6001
6002                 xmlTextWriterEndElement(writer); /* socket */
6003
6004                 UNLOCK(&sock->lock);
6005                 sock = ISC_LIST_NEXT(sock, link);
6006         }
6007         xmlTextWriterEndElement(writer); /* sockets */
6008
6009         UNLOCK(&mgr->lock);
6010 }
6011 #endif /* HAVE_LIBXML2 */