]> CyberLeo.Net >> Repos - FreeBSD/releng/8.1.git/blob - contrib/bind9/lib/isc/unix/socket.c
Copy stable/8 to releng/8.1 in preparation for 8.1-RC1.
[FreeBSD/releng/8.1.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2010  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.308.12.12 2010/01/31 23:47:31 tbox Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #include <sys/devpoll.h>
71 #endif
72
73 #include "errno2result.h"
74
75 #ifndef ISC_PLATFORM_USETHREADS
76 #include "socket_p.h"
77 #endif /* ISC_PLATFORM_USETHREADS */
78
79 #if defined(SO_BSDCOMPAT) && defined(__linux__)
80 #include <sys/utsname.h>
81 #endif
82
83 /*%
84  * Choose the most preferable multiplex method.
85  */
86 #ifdef ISC_PLATFORM_HAVEKQUEUE
87 #define USE_KQUEUE
88 #elif defined (ISC_PLATFORM_HAVEEPOLL)
89 #define USE_EPOLL
90 #elif defined (ISC_PLATFORM_HAVEDEVPOLL)
91 #define USE_DEVPOLL
92 typedef struct {
93         unsigned int want_read : 1,
94                 want_write : 1;
95 } pollinfo_t;
96 #else
97 #define USE_SELECT
98 #endif  /* ISC_PLATFORM_HAVEKQUEUE */
99
100 #ifndef ISC_PLATFORM_USETHREADS
101 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
102 struct isc_socketwait {
103         int nevents;
104 };
105 #elif defined (USE_SELECT)
106 struct isc_socketwait {
107         fd_set *readset;
108         fd_set *writeset;
109         int nfds;
110         int maxfd;
111 };
112 #endif  /* USE_KQUEUE */
113 #endif /* !ISC_PLATFORM_USETHREADS */
114
115 /*%
116  * Maximum number of allowable open sockets.  This is also the maximum
117  * allowable socket file descriptor.
118  *
119  * Care should be taken before modifying this value for select():
120  * The API standard doesn't ensure select() accepts more than (the system default
121  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
122  * the vast majority of cases.  This constant should therefore be increased only
123  * when absolutely necessary and possible, i.e., the server is exhausting all
124  * available file descriptors (up to FD_SETSIZE) and the select() function
125  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
126  * always be true, but we keep using some of them to ensure as much
127  * portability as possible).  Note also that overall server performance
128  * may be rather worsened with a larger value of this constant due to
129  * inherent scalability problems of select().
130  *
131  * As a special note, this value shouldn't have to be touched if
132  * this is a build for an authoritative only DNS server.
133  */
134 #ifndef ISC_SOCKET_MAXSOCKETS
135 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
136 #define ISC_SOCKET_MAXSOCKETS 4096
137 #elif defined(USE_SELECT)
138 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
139 #endif  /* USE_KQUEUE... */
140 #endif  /* ISC_SOCKET_MAXSOCKETS */
141
142 #ifdef USE_SELECT
143 /*%
144  * Mac OS X needs a special definition to support larger values in select().
145  * We always define this because a larger value can be specified run-time.
146  */
147 #ifdef __APPLE__
148 #define _DARWIN_UNLIMITED_SELECT
149 #endif  /* __APPLE__ */
150 #endif  /* USE_SELECT */
151
152 #ifdef ISC_SOCKET_USE_POLLWATCH
153 /*%
154  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
155  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
156  * some of the specified FD.  The idea is based on the observation that it's
157  * likely for a busy server to keep receiving packets.  It specifically works
158  * as follows: the socket watcher is first initialized with the state of
159  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
160  * event occurs.  When it wakes up for a socket I/O event, it moves to the
161  * poll_active state, and sets the poll timeout to a short period
162  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
163  * watcher goes to the poll_checking state with the same timeout period.
164  * In this state, the watcher tries to detect whether this is a break
165  * during intermittent events or the kernel bug is triggered.  If the next
166  * polling reports an event within the short period, the previous timeout is
167  * likely to be a kernel bug, and so the watcher goes back to the active state.
168  * Otherwise, it moves to the idle state again.
169  *
170  * It's not clear whether this is a thread-related bug, but since we've only
171  * seen this with threads, this workaround is used only when enabling threads.
172  */
173
174 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
175
176 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
177 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
178 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
179 #endif  /* ISC_SOCKET_USE_POLLWATCH */
180
181 /*%
182  * Size of per-FD lock buckets.
183  */
184 #ifdef ISC_PLATFORM_USETHREADS
185 #define FDLOCK_COUNT            1024
186 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
187 #else
188 #define FDLOCK_COUNT            1
189 #define FDLOCK_ID(fd)           0
190 #endif  /* ISC_PLATFORM_USETHREADS */
191
192 /*%
193  * Maximum number of events communicated with the kernel.  There should normally
194  * be no need for having a large number.
195  */
196 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
197 #ifndef ISC_SOCKET_MAXEVENTS
198 #define ISC_SOCKET_MAXEVENTS    64
199 #endif
200 #endif
201
202 /*%
203  * Some systems define the socket length argument as an int, some as size_t,
204  * some as socklen_t.  This is here so it can be easily changed if needed.
205  */
206 #ifndef ISC_SOCKADDR_LEN_T
207 #define ISC_SOCKADDR_LEN_T unsigned int
208 #endif
209
210 /*%
211  * Define what the possible "soft" errors can be.  These are non-fatal returns
212  * of various network related functions, like recv() and so on.
213  *
214  * For some reason, BSDI (and perhaps others) will sometimes return <0
215  * from recv() but will have errno==0.  This is broken, but we have to
216  * work around it here.
217  */
218 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
219                          (e) == EWOULDBLOCK || \
220                          (e) == EINTR || \
221                          (e) == 0)
222
223 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
224
225 /*!<
226  * DLVL(90)  --  Function entry/exit and other tracing.
227  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
228  * DLVL(60)  --  Socket data send/receive
229  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
230  * DLVL(20)  --  Socket creation/destruction.
231  */
232 #define TRACE_LEVEL             90
233 #define CORRECTNESS_LEVEL       70
234 #define IOEVENT_LEVEL           60
235 #define EVENT_LEVEL             50
236 #define CREATION_LEVEL          20
237
238 #define TRACE           DLVL(TRACE_LEVEL)
239 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
240 #define IOEVENT         DLVL(IOEVENT_LEVEL)
241 #define EVENT           DLVL(EVENT_LEVEL)
242 #define CREATION        DLVL(CREATION_LEVEL)
243
244 typedef isc_event_t intev_t;
245
246 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
247 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
248
249 /*!
250  * IPv6 control information.  If the socket is an IPv6 socket we want
251  * to collect the destination address and interface so the client can
252  * set them on outgoing packets.
253  */
254 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
255 #ifndef USE_CMSG
256 #define USE_CMSG        1
257 #endif
258 #endif
259
260 /*%
261  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
262  * a setsockopt() like interface to request timestamps, and if the OS
263  * doesn't do it for us, call gettimeofday() on every UDP receive?
264  */
265 #ifdef SO_TIMESTAMP
266 #ifndef USE_CMSG
267 #define USE_CMSG        1
268 #endif
269 #endif
270
271 /*%
272  * The size to raise the receive buffer to (from BIND 8).
273  */
274 #define RCVBUFSIZE (32*1024)
275
276 /*%
277  * The number of times a send operation is repeated if the result is EINTR.
278  */
279 #define NRETRIES 10
280
281 struct isc_socket {
282         /* Not locked. */
283         unsigned int            magic;
284         isc_socketmgr_t        *manager;
285         isc_mutex_t             lock;
286         isc_sockettype_t        type;
287         const isc_statscounter_t        *statsindex;
288
289         /* Locked by socket lock. */
290         ISC_LINK(isc_socket_t)  link;
291         unsigned int            references;
292         int                     fd;
293         int                     pf;
294         char                            name[16];
295         void *                          tag;
296
297         ISC_LIST(isc_socketevent_t)             send_list;
298         ISC_LIST(isc_socketevent_t)             recv_list;
299         ISC_LIST(isc_socket_newconnev_t)        accept_list;
300         isc_socket_connev_t                    *connect_ev;
301
302         /*
303          * Internal events.  Posted when a descriptor is readable or
304          * writable.  These are statically allocated and never freed.
305          * They will be set to non-purgable before use.
306          */
307         intev_t                 readable_ev;
308         intev_t                 writable_ev;
309
310         isc_sockaddr_t          peer_address;  /* remote address */
311
312         unsigned int            pending_recv : 1,
313                                 pending_send : 1,
314                                 pending_accept : 1,
315                                 listener : 1, /* listener socket */
316                                 connected : 1,
317                                 connecting : 1, /* connect pending */
318                                 bound : 1; /* bound to local addr */
319
320 #ifdef ISC_NET_RECVOVERFLOW
321         unsigned char           overflow; /* used for MSG_TRUNC fake */
322 #endif
323
324         char                    *recvcmsgbuf;
325         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
326         char                    *sendcmsgbuf;
327         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
328
329         void                    *fdwatcharg;
330         isc_sockfdwatch_t       fdwatchcb;
331         int                     fdwatchflags;
332         isc_task_t              *fdwatchtask;
333 };
334
335 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
336 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
337
338 struct isc_socketmgr {
339         /* Not locked. */
340         unsigned int            magic;
341         isc_mem_t              *mctx;
342         isc_mutex_t             lock;
343         isc_mutex_t             *fdlock;
344         isc_stats_t             *stats;
345 #ifdef USE_KQUEUE
346         int                     kqueue_fd;
347         int                     nevents;
348         struct kevent           *events;
349 #endif  /* USE_KQUEUE */
350 #ifdef USE_EPOLL
351         int                     epoll_fd;
352         int                     nevents;
353         struct epoll_event      *events;
354 #endif  /* USE_EPOLL */
355 #ifdef USE_DEVPOLL
356         int                     devpoll_fd;
357         int                     nevents;
358         struct pollfd           *events;
359 #endif  /* USE_DEVPOLL */
360 #ifdef USE_SELECT
361         int                     fd_bufsize;
362 #endif  /* USE_SELECT */
363         unsigned int            maxsocks;
364 #ifdef ISC_PLATFORM_USETHREADS
365         int                     pipe_fds[2];
366 #endif
367
368         /* Locked by fdlock. */
369         isc_socket_t           **fds;
370         int                     *fdstate;
371 #ifdef USE_DEVPOLL
372         pollinfo_t              *fdpollinfo;
373 #endif
374
375         /* Locked by manager lock. */
376         ISC_LIST(isc_socket_t)  socklist;
377 #ifdef USE_SELECT
378         fd_set                  *read_fds;
379         fd_set                  *read_fds_copy;
380         fd_set                  *write_fds;
381         fd_set                  *write_fds_copy;
382         int                     maxfd;
383 #endif  /* USE_SELECT */
384         int                     reserved;       /* unlocked */
385 #ifdef ISC_PLATFORM_USETHREADS
386         isc_thread_t            watcher;
387         isc_condition_t         shutdown_ok;
388 #else /* ISC_PLATFORM_USETHREADS */
389         unsigned int            refs;
390 #endif /* ISC_PLATFORM_USETHREADS */
391 };
392
393 #ifndef ISC_PLATFORM_USETHREADS
394 static isc_socketmgr_t *socketmgr = NULL;
395 #endif /* ISC_PLATFORM_USETHREADS */
396
397 #define CLOSED                  0       /* this one must be zero */
398 #define MANAGED                 1
399 #define CLOSE_PENDING           2
400
401 /*
402  * send() and recv() iovec counts
403  */
404 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
405 #ifdef ISC_NET_RECVOVERFLOW
406 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
407 #else
408 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
409 #endif
410
411 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
412 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
413 static void free_socket(isc_socket_t **);
414 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
415                                     isc_socket_t **);
416 static void destroy(isc_socket_t **);
417 static void internal_accept(isc_task_t *, isc_event_t *);
418 static void internal_connect(isc_task_t *, isc_event_t *);
419 static void internal_recv(isc_task_t *, isc_event_t *);
420 static void internal_send(isc_task_t *, isc_event_t *);
421 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
422 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
423 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
424 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
425                               struct msghdr *, struct iovec *, size_t *);
426 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
427                               struct msghdr *, struct iovec *, size_t *);
428 #ifdef ISC_PLATFORM_USETHREADS
429 static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
430 #endif
431
432 #define SELECT_POKE_SHUTDOWN            (-1)
433 #define SELECT_POKE_NOTHING             (-2)
434 #define SELECT_POKE_READ                (-3)
435 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
436 #define SELECT_POKE_WRITE               (-4)
437 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
438 #define SELECT_POKE_CLOSE               (-5)
439
440 #define SOCK_DEAD(s)                    ((s)->references == 0)
441
442 /*%
443  * Shortcut index arrays to get access to statistics counters.
444  */
445 enum {
446         STATID_OPEN = 0,
447         STATID_OPENFAIL = 1,
448         STATID_CLOSE = 2,
449         STATID_BINDFAIL = 3,
450         STATID_CONNECTFAIL = 4,
451         STATID_CONNECT = 5,
452         STATID_ACCEPTFAIL = 6,
453         STATID_ACCEPT = 7,
454         STATID_SENDFAIL = 8,
455         STATID_RECVFAIL = 9
456 };
457 static const isc_statscounter_t upd4statsindex[] = {
458         isc_sockstatscounter_udp4open,
459         isc_sockstatscounter_udp4openfail,
460         isc_sockstatscounter_udp4close,
461         isc_sockstatscounter_udp4bindfail,
462         isc_sockstatscounter_udp4connectfail,
463         isc_sockstatscounter_udp4connect,
464         -1,
465         -1,
466         isc_sockstatscounter_udp4sendfail,
467         isc_sockstatscounter_udp4recvfail
468 };
469 static const isc_statscounter_t upd6statsindex[] = {
470         isc_sockstatscounter_udp6open,
471         isc_sockstatscounter_udp6openfail,
472         isc_sockstatscounter_udp6close,
473         isc_sockstatscounter_udp6bindfail,
474         isc_sockstatscounter_udp6connectfail,
475         isc_sockstatscounter_udp6connect,
476         -1,
477         -1,
478         isc_sockstatscounter_udp6sendfail,
479         isc_sockstatscounter_udp6recvfail
480 };
481 static const isc_statscounter_t tcp4statsindex[] = {
482         isc_sockstatscounter_tcp4open,
483         isc_sockstatscounter_tcp4openfail,
484         isc_sockstatscounter_tcp4close,
485         isc_sockstatscounter_tcp4bindfail,
486         isc_sockstatscounter_tcp4connectfail,
487         isc_sockstatscounter_tcp4connect,
488         isc_sockstatscounter_tcp4acceptfail,
489         isc_sockstatscounter_tcp4accept,
490         isc_sockstatscounter_tcp4sendfail,
491         isc_sockstatscounter_tcp4recvfail
492 };
493 static const isc_statscounter_t tcp6statsindex[] = {
494         isc_sockstatscounter_tcp6open,
495         isc_sockstatscounter_tcp6openfail,
496         isc_sockstatscounter_tcp6close,
497         isc_sockstatscounter_tcp6bindfail,
498         isc_sockstatscounter_tcp6connectfail,
499         isc_sockstatscounter_tcp6connect,
500         isc_sockstatscounter_tcp6acceptfail,
501         isc_sockstatscounter_tcp6accept,
502         isc_sockstatscounter_tcp6sendfail,
503         isc_sockstatscounter_tcp6recvfail
504 };
505 static const isc_statscounter_t unixstatsindex[] = {
506         isc_sockstatscounter_unixopen,
507         isc_sockstatscounter_unixopenfail,
508         isc_sockstatscounter_unixclose,
509         isc_sockstatscounter_unixbindfail,
510         isc_sockstatscounter_unixconnectfail,
511         isc_sockstatscounter_unixconnect,
512         isc_sockstatscounter_unixacceptfail,
513         isc_sockstatscounter_unixaccept,
514         isc_sockstatscounter_unixsendfail,
515         isc_sockstatscounter_unixrecvfail
516 };
517 static const isc_statscounter_t fdwatchstatsindex[] = {
518         -1,
519         -1,
520         isc_sockstatscounter_fdwatchclose,
521         isc_sockstatscounter_fdwatchbindfail,
522         isc_sockstatscounter_fdwatchconnectfail,
523         isc_sockstatscounter_fdwatchconnect,
524         -1,
525         -1,
526         isc_sockstatscounter_fdwatchsendfail,
527         isc_sockstatscounter_fdwatchrecvfail
528 };
529
530 static void
531 manager_log(isc_socketmgr_t *sockmgr,
532             isc_logcategory_t *category, isc_logmodule_t *module, int level,
533             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
534 static void
535 manager_log(isc_socketmgr_t *sockmgr,
536             isc_logcategory_t *category, isc_logmodule_t *module, int level,
537             const char *fmt, ...)
538 {
539         char msgbuf[2048];
540         va_list ap;
541
542         if (! isc_log_wouldlog(isc_lctx, level))
543                 return;
544
545         va_start(ap, fmt);
546         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
547         va_end(ap);
548
549         isc_log_write(isc_lctx, category, module, level,
550                       "sockmgr %p: %s", sockmgr, msgbuf);
551 }
552
553 static void
554 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
555            isc_logcategory_t *category, isc_logmodule_t *module, int level,
556            isc_msgcat_t *msgcat, int msgset, int message,
557            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
558 static void
559 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
560            isc_logcategory_t *category, isc_logmodule_t *module, int level,
561            isc_msgcat_t *msgcat, int msgset, int message,
562            const char *fmt, ...)
563 {
564         char msgbuf[2048];
565         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
566         va_list ap;
567
568         if (! isc_log_wouldlog(isc_lctx, level))
569                 return;
570
571         va_start(ap, fmt);
572         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
573         va_end(ap);
574
575         if (address == NULL) {
576                 isc_log_iwrite(isc_lctx, category, module, level,
577                                msgcat, msgset, message,
578                                "socket %p: %s", sock, msgbuf);
579         } else {
580                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
581                 isc_log_iwrite(isc_lctx, category, module, level,
582                                msgcat, msgset, message,
583                                "socket %p %s: %s", sock, peerbuf, msgbuf);
584         }
585 }
586
587 #if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
588     defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
589 /*
590  * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
591  * setting IPV6_V6ONLY.
592  */
593 static void
594 FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
595 {
596         char strbuf[ISC_STRERRORSIZE];
597         int on = 1;
598
599         if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
600                 return;
601
602         if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
603                        (void *)&on, sizeof(on)) < 0) {
604
605                 UNEXPECTED_ERROR(__FILE__, __LINE__,
606                                  "setsockopt(%d, IPV6_RECVPKTINFO) "
607                                  "%s: %s", sock->fd,
608                                  isc_msgcat_get(isc_msgcat,
609                                                 ISC_MSGSET_GENERAL,
610                                                 ISC_MSG_FAILED,
611                                                 "failed"),
612                                  strbuf);
613         }
614 }
615 #else
616 #define FIX_IPV6_RECVPKTINFO(sock) (void)0
617 #endif
618
619 /*%
620  * Increment socket-related statistics counters.
621  */
622 static inline void
623 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
624         REQUIRE(counterid != -1);
625
626         if (stats != NULL)
627                 isc_stats_increment(stats, counterid);
628 }
629
630 static inline isc_result_t
631 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
632         isc_result_t result = ISC_R_SUCCESS;
633
634 #ifdef USE_KQUEUE
635         struct kevent evchange;
636
637         memset(&evchange, 0, sizeof(evchange));
638         if (msg == SELECT_POKE_READ)
639                 evchange.filter = EVFILT_READ;
640         else
641                 evchange.filter = EVFILT_WRITE;
642         evchange.flags = EV_ADD;
643         evchange.ident = fd;
644         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
645                 result = isc__errno2result(errno);
646
647         return (result);
648 #elif defined(USE_EPOLL)
649         struct epoll_event event;
650
651         if (msg == SELECT_POKE_READ)
652                 event.events = EPOLLIN;
653         else
654                 event.events = EPOLLOUT;
655         event.data.fd = fd;
656         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
657             errno != EEXIST) {
658                 result = isc__errno2result(errno);
659         }
660
661         return (result);
662 #elif defined(USE_DEVPOLL)
663         struct pollfd pfd;
664         int lockid = FDLOCK_ID(fd);
665
666         memset(&pfd, 0, sizeof(pfd));
667         if (msg == SELECT_POKE_READ)
668                 pfd.events = POLLIN;
669         else
670                 pfd.events = POLLOUT;
671         pfd.fd = fd;
672         pfd.revents = 0;
673         LOCK(&manager->fdlock[lockid]);
674         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
675                 result = isc__errno2result(errno);
676         else {
677                 if (msg == SELECT_POKE_READ)
678                         manager->fdpollinfo[fd].want_read = 1;
679                 else
680                         manager->fdpollinfo[fd].want_write = 1;
681         }
682         UNLOCK(&manager->fdlock[lockid]);
683
684         return (result);
685 #elif defined(USE_SELECT)
686         LOCK(&manager->lock);
687         if (msg == SELECT_POKE_READ)
688                 FD_SET(fd, manager->read_fds);
689         if (msg == SELECT_POKE_WRITE)
690                 FD_SET(fd, manager->write_fds);
691         UNLOCK(&manager->lock);
692
693         return (result);
694 #endif
695 }
696
697 static inline isc_result_t
698 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
699         isc_result_t result = ISC_R_SUCCESS;
700
701 #ifdef USE_KQUEUE
702         struct kevent evchange;
703
704         memset(&evchange, 0, sizeof(evchange));
705         if (msg == SELECT_POKE_READ)
706                 evchange.filter = EVFILT_READ;
707         else
708                 evchange.filter = EVFILT_WRITE;
709         evchange.flags = EV_DELETE;
710         evchange.ident = fd;
711         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
712                 result = isc__errno2result(errno);
713
714         return (result);
715 #elif defined(USE_EPOLL)
716         struct epoll_event event;
717
718         if (msg == SELECT_POKE_READ)
719                 event.events = EPOLLIN;
720         else
721                 event.events = EPOLLOUT;
722         event.data.fd = fd;
723         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
724             errno != ENOENT) {
725                 char strbuf[ISC_STRERRORSIZE];
726                 isc__strerror(errno, strbuf, sizeof(strbuf));
727                 UNEXPECTED_ERROR(__FILE__, __LINE__,
728                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
729                 result = ISC_R_UNEXPECTED;
730         }
731         return (result);
732 #elif defined(USE_DEVPOLL)
733         struct pollfd pfds[2];
734         size_t writelen = sizeof(pfds[0]);
735         int lockid = FDLOCK_ID(fd);
736
737         memset(pfds, 0, sizeof(pfds));
738         pfds[0].events = POLLREMOVE;
739         pfds[0].fd = fd;
740
741         /*
742          * Canceling read or write polling via /dev/poll is tricky.  Since it
743          * only provides a way of canceling per FD, we may need to re-poll the
744          * socket for the other operation.
745          */
746         LOCK(&manager->fdlock[lockid]);
747         if (msg == SELECT_POKE_READ &&
748             manager->fdpollinfo[fd].want_write == 1) {
749                 pfds[1].events = POLLOUT;
750                 pfds[1].fd = fd;
751                 writelen += sizeof(pfds[1]);
752         }
753         if (msg == SELECT_POKE_WRITE &&
754             manager->fdpollinfo[fd].want_read == 1) {
755                 pfds[1].events = POLLIN;
756                 pfds[1].fd = fd;
757                 writelen += sizeof(pfds[1]);
758         }
759
760         if (write(manager->devpoll_fd, pfds, writelen) == -1)
761                 result = isc__errno2result(errno);
762         else {
763                 if (msg == SELECT_POKE_READ)
764                         manager->fdpollinfo[fd].want_read = 0;
765                 else
766                         manager->fdpollinfo[fd].want_write = 0;
767         }
768         UNLOCK(&manager->fdlock[lockid]);
769
770         return (result);
771 #elif defined(USE_SELECT)
772         LOCK(&manager->lock);
773         if (msg == SELECT_POKE_READ)
774                 FD_CLR(fd, manager->read_fds);
775         else if (msg == SELECT_POKE_WRITE)
776                 FD_CLR(fd, manager->write_fds);
777         UNLOCK(&manager->lock);
778
779         return (result);
780 #endif
781 }
782
783 static void
784 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
785         isc_result_t result;
786         int lockid = FDLOCK_ID(fd);
787
788         /*
789          * This is a wakeup on a socket.  If the socket is not in the
790          * process of being closed, start watching it for either reads
791          * or writes.
792          */
793
794         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
795
796         if (msg == SELECT_POKE_CLOSE) {
797                 /* No one should be updating fdstate, so no need to lock it */
798                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
799                 manager->fdstate[fd] = CLOSED;
800                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
801                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
802                 (void)close(fd);
803                 return;
804         }
805
806         LOCK(&manager->fdlock[lockid]);
807         if (manager->fdstate[fd] == CLOSE_PENDING) {
808                 UNLOCK(&manager->fdlock[lockid]);
809
810                 /*
811                  * We accept (and ignore) any error from unwatch_fd() as we are
812                  * closing the socket, hoping it doesn't leave dangling state in
813                  * the kernel.
814                  * Note that unwatch_fd() must be called after releasing the
815                  * fdlock; otherwise it could cause deadlock due to a lock order
816                  * reversal.
817                  */
818                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
819                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
820                 return;
821         }
822         if (manager->fdstate[fd] != MANAGED) {
823                 UNLOCK(&manager->fdlock[lockid]);
824                 return;
825         }
826         UNLOCK(&manager->fdlock[lockid]);
827
828         /*
829          * Set requested bit.
830          */
831         result = watch_fd(manager, fd, msg);
832         if (result != ISC_R_SUCCESS) {
833                 /*
834                  * XXXJT: what should we do?  Ignoring the failure of watching
835                  * a socket will make the application dysfunctional, but there
836                  * seems to be no reasonable recovery process.
837                  */
838                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
839                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
840                               "failed to start watching FD (%d): %s",
841                               fd, isc_result_totext(result));
842         }
843 }
844
845 #ifdef ISC_PLATFORM_USETHREADS
846 /*
847  * Poke the select loop when there is something for us to do.
848  * The write is required (by POSIX) to complete.  That is, we
849  * will not get partial writes.
850  */
851 static void
852 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
853         int cc;
854         int buf[2];
855         char strbuf[ISC_STRERRORSIZE];
856
857         buf[0] = fd;
858         buf[1] = msg;
859
860         do {
861                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
862 #ifdef ENOSR
863                 /*
864                  * Treat ENOSR as EAGAIN but loop slowly as it is
865                  * unlikely to clear fast.
866                  */
867                 if (cc < 0 && errno == ENOSR) {
868                         sleep(1);
869                         errno = EAGAIN;
870                 }
871 #endif
872         } while (cc < 0 && SOFT_ERROR(errno));
873
874         if (cc < 0) {
875                 isc__strerror(errno, strbuf, sizeof(strbuf));
876                 FATAL_ERROR(__FILE__, __LINE__,
877                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
878                                            ISC_MSG_WRITEFAILED,
879                                            "write() failed "
880                                            "during watcher poke: %s"),
881                             strbuf);
882         }
883
884         INSIST(cc == sizeof(buf));
885 }
886
887 /*
888  * Read a message on the internal fd.
889  */
890 static void
891 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
892         int buf[2];
893         int cc;
894         char strbuf[ISC_STRERRORSIZE];
895
896         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
897         if (cc < 0) {
898                 *msg = SELECT_POKE_NOTHING;
899                 *fd = -1;       /* Silence compiler. */
900                 if (SOFT_ERROR(errno))
901                         return;
902
903                 isc__strerror(errno, strbuf, sizeof(strbuf));
904                 FATAL_ERROR(__FILE__, __LINE__,
905                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
906                                            ISC_MSG_READFAILED,
907                                            "read() failed "
908                                            "during watcher poke: %s"),
909                             strbuf);
910
911                 return;
912         }
913         INSIST(cc == sizeof(buf));
914
915         *fd = buf[0];
916         *msg = buf[1];
917 }
918 #else /* ISC_PLATFORM_USETHREADS */
919 /*
920  * Update the state of the socketmgr when something changes.
921  */
922 static void
923 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
924         if (msg == SELECT_POKE_SHUTDOWN)
925                 return;
926         else if (fd >= 0)
927                 wakeup_socket(manager, fd, msg);
928         return;
929 }
930 #endif /* ISC_PLATFORM_USETHREADS */
931
932 /*
933  * Make a fd non-blocking.
934  */
935 static isc_result_t
936 make_nonblock(int fd) {
937         int ret;
938         int flags;
939         char strbuf[ISC_STRERRORSIZE];
940 #ifdef USE_FIONBIO_IOCTL
941         int on = 1;
942
943         ret = ioctl(fd, FIONBIO, (char *)&on);
944 #else
945         flags = fcntl(fd, F_GETFL, 0);
946         flags |= PORT_NONBLOCK;
947         ret = fcntl(fd, F_SETFL, flags);
948 #endif
949
950         if (ret == -1) {
951                 isc__strerror(errno, strbuf, sizeof(strbuf));
952                 UNEXPECTED_ERROR(__FILE__, __LINE__,
953 #ifdef USE_FIONBIO_IOCTL
954                                  "ioctl(%d, FIONBIO, &on): %s", fd,
955 #else
956                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
957 #endif
958                                  strbuf);
959
960                 return (ISC_R_UNEXPECTED);
961         }
962
963         return (ISC_R_SUCCESS);
964 }
965
966 #ifdef USE_CMSG
967 /*
968  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
969  * In order to ensure as much portability as possible, we provide wrapper
970  * functions of these macros.
971  * Note that cmsg_space() could run slow on OSes that do not have
972  * CMSG_SPACE.
973  */
974 static inline ISC_SOCKADDR_LEN_T
975 cmsg_len(ISC_SOCKADDR_LEN_T len) {
976 #ifdef CMSG_LEN
977         return (CMSG_LEN(len));
978 #else
979         ISC_SOCKADDR_LEN_T hdrlen;
980
981         /*
982          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
983          * is correct.
984          */
985         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
986         return (hdrlen + len);
987 #endif
988 }
989
990 static inline ISC_SOCKADDR_LEN_T
991 cmsg_space(ISC_SOCKADDR_LEN_T len) {
992 #ifdef CMSG_SPACE
993         return (CMSG_SPACE(len));
994 #else
995         struct msghdr msg;
996         struct cmsghdr *cmsgp;
997         /*
998          * XXX: The buffer length is an ad-hoc value, but should be enough
999          * in a practical sense.
1000          */
1001         char dummybuf[sizeof(struct cmsghdr) + 1024];
1002
1003         memset(&msg, 0, sizeof(msg));
1004         msg.msg_control = dummybuf;
1005         msg.msg_controllen = sizeof(dummybuf);
1006
1007         cmsgp = (struct cmsghdr *)dummybuf;
1008         cmsgp->cmsg_len = cmsg_len(len);
1009
1010         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1011         if (cmsgp != NULL)
1012                 return ((char *)cmsgp - (char *)msg.msg_control);
1013         else
1014                 return (0);
1015 #endif
1016 }
1017 #endif /* USE_CMSG */
1018
1019 /*
1020  * Process control messages received on a socket.
1021  */
1022 static void
1023 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1024 #ifdef USE_CMSG
1025         struct cmsghdr *cmsgp;
1026 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1027         struct in6_pktinfo *pktinfop;
1028 #endif
1029 #ifdef SO_TIMESTAMP
1030         struct timeval *timevalp;
1031 #endif
1032 #endif
1033
1034         /*
1035          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1036          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1037          * They are all here, outside of the CPP tests, because it is
1038          * more consistent with the usual ISC coding style.
1039          */
1040         UNUSED(sock);
1041         UNUSED(msg);
1042         UNUSED(dev);
1043
1044 #ifdef ISC_NET_BSD44MSGHDR
1045
1046 #ifdef MSG_TRUNC
1047         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1048                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1049 #endif
1050
1051 #ifdef MSG_CTRUNC
1052         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1053                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1054 #endif
1055
1056 #ifndef USE_CMSG
1057         return;
1058 #else
1059         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1060                 return;
1061
1062 #ifdef SO_TIMESTAMP
1063         timevalp = NULL;
1064 #endif
1065 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1066         pktinfop = NULL;
1067 #endif
1068
1069         cmsgp = CMSG_FIRSTHDR(msg);
1070         while (cmsgp != NULL) {
1071                 socket_log(sock, NULL, TRACE,
1072                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1073                            "processing cmsg %p", cmsgp);
1074
1075 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1076                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1077                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1078
1079                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1080                         memcpy(&dev->pktinfo, pktinfop,
1081                                sizeof(struct in6_pktinfo));
1082                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1083                         socket_log(sock, NULL, TRACE,
1084                                    isc_msgcat, ISC_MSGSET_SOCKET,
1085                                    ISC_MSG_IFRECEIVED,
1086                                    "interface received on ifindex %u",
1087                                    dev->pktinfo.ipi6_ifindex);
1088                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1089                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1090                         goto next;
1091                 }
1092 #endif
1093
1094 #ifdef SO_TIMESTAMP
1095                 if (cmsgp->cmsg_level == SOL_SOCKET
1096                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1097                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1098                         dev->timestamp.seconds = timevalp->tv_sec;
1099                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1100                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1101                         goto next;
1102                 }
1103 #endif
1104
1105         next:
1106                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1107         }
1108 #endif /* USE_CMSG */
1109
1110 #endif /* ISC_NET_BSD44MSGHDR */
1111 }
1112
1113 /*
1114  * Construct an iov array and attach it to the msghdr passed in.  This is
1115  * the SEND constructor, which will use the used region of the buffer
1116  * (if using a buffer list) or will use the internal region (if a single
1117  * buffer I/O is requested).
1118  *
1119  * Nothing can be NULL, and the done event must list at least one buffer
1120  * on the buffer linked list for this function to be meaningful.
1121  *
1122  * If write_countp != NULL, *write_countp will hold the number of bytes
1123  * this transaction can send.
1124  */
1125 static void
1126 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
1127                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1128 {
1129         unsigned int iovcount;
1130         isc_buffer_t *buffer;
1131         isc_region_t used;
1132         size_t write_count;
1133         size_t skip_count;
1134
1135         memset(msg, 0, sizeof(*msg));
1136
1137         if (!sock->connected) {
1138                 msg->msg_name = (void *)&dev->address.type.sa;
1139                 msg->msg_namelen = dev->address.length;
1140         } else {
1141                 msg->msg_name = NULL;
1142                 msg->msg_namelen = 0;
1143         }
1144
1145         buffer = ISC_LIST_HEAD(dev->bufferlist);
1146         write_count = 0;
1147         iovcount = 0;
1148
1149         /*
1150          * Single buffer I/O?  Skip what we've done so far in this region.
1151          */
1152         if (buffer == NULL) {
1153                 write_count = dev->region.length - dev->n;
1154                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1155                 iov[0].iov_len = write_count;
1156                 iovcount = 1;
1157
1158                 goto config;
1159         }
1160
1161         /*
1162          * Multibuffer I/O.
1163          * Skip the data in the buffer list that we have already written.
1164          */
1165         skip_count = dev->n;
1166         while (buffer != NULL) {
1167                 REQUIRE(ISC_BUFFER_VALID(buffer));
1168                 if (skip_count < isc_buffer_usedlength(buffer))
1169                         break;
1170                 skip_count -= isc_buffer_usedlength(buffer);
1171                 buffer = ISC_LIST_NEXT(buffer, link);
1172         }
1173
1174         while (buffer != NULL) {
1175                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1176
1177                 isc_buffer_usedregion(buffer, &used);
1178
1179                 if (used.length > 0) {
1180                         iov[iovcount].iov_base = (void *)(used.base
1181                                                           + skip_count);
1182                         iov[iovcount].iov_len = used.length - skip_count;
1183                         write_count += (used.length - skip_count);
1184                         skip_count = 0;
1185                         iovcount++;
1186                 }
1187                 buffer = ISC_LIST_NEXT(buffer, link);
1188         }
1189
1190         INSIST(skip_count == 0U);
1191
1192  config:
1193         msg->msg_iov = iov;
1194         msg->msg_iovlen = iovcount;
1195
1196 #ifdef ISC_NET_BSD44MSGHDR
1197         msg->msg_control = NULL;
1198         msg->msg_controllen = 0;
1199         msg->msg_flags = 0;
1200 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1201         if ((sock->type == isc_sockettype_udp)
1202             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1203                 struct cmsghdr *cmsgp;
1204                 struct in6_pktinfo *pktinfop;
1205
1206                 socket_log(sock, NULL, TRACE,
1207                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1208                            "sendto pktinfo data, ifindex %u",
1209                            dev->pktinfo.ipi6_ifindex);
1210
1211                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1212                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1213                 msg->msg_control = (void *)sock->sendcmsgbuf;
1214
1215                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1216                 cmsgp->cmsg_level = IPPROTO_IPV6;
1217                 cmsgp->cmsg_type = IPV6_PKTINFO;
1218                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1219                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1220                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1221         }
1222 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1223 #else /* ISC_NET_BSD44MSGHDR */
1224         msg->msg_accrights = NULL;
1225         msg->msg_accrightslen = 0;
1226 #endif /* ISC_NET_BSD44MSGHDR */
1227
1228         if (write_countp != NULL)
1229                 *write_countp = write_count;
1230 }
1231
1232 /*
1233  * Construct an iov array and attach it to the msghdr passed in.  This is
1234  * the RECV constructor, which will use the available region of the buffer
1235  * (if using a buffer list) or will use the internal region (if a single
1236  * buffer I/O is requested).
1237  *
1238  * Nothing can be NULL, and the done event must list at least one buffer
1239  * on the buffer linked list for this function to be meaningful.
1240  *
1241  * If read_countp != NULL, *read_countp will hold the number of bytes
1242  * this transaction can receive.
1243  */
1244 static void
1245 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1246                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1247 {
1248         unsigned int iovcount;
1249         isc_buffer_t *buffer;
1250         isc_region_t available;
1251         size_t read_count;
1252
1253         memset(msg, 0, sizeof(struct msghdr));
1254
1255         if (sock->type == isc_sockettype_udp) {
1256                 memset(&dev->address, 0, sizeof(dev->address));
1257 #ifdef BROKEN_RECVMSG
1258                 if (sock->pf == AF_INET) {
1259                         msg->msg_name = (void *)&dev->address.type.sin;
1260                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1261                 } else if (sock->pf == AF_INET6) {
1262                         msg->msg_name = (void *)&dev->address.type.sin6;
1263                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1264 #ifdef ISC_PLATFORM_HAVESYSUNH
1265                 } else if (sock->pf == AF_UNIX) {
1266                         msg->msg_name = (void *)&dev->address.type.sunix;
1267                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1268 #endif
1269                 } else {
1270                         msg->msg_name = (void *)&dev->address.type.sa;
1271                         msg->msg_namelen = sizeof(dev->address.type);
1272                 }
1273 #else
1274                 msg->msg_name = (void *)&dev->address.type.sa;
1275                 msg->msg_namelen = sizeof(dev->address.type);
1276 #endif
1277 #ifdef ISC_NET_RECVOVERFLOW
1278                 /* If needed, steal one iovec for overflow detection. */
1279                 maxiov--;
1280 #endif
1281         } else { /* TCP */
1282                 msg->msg_name = NULL;
1283                 msg->msg_namelen = 0;
1284                 dev->address = sock->peer_address;
1285         }
1286
1287         buffer = ISC_LIST_HEAD(dev->bufferlist);
1288         read_count = 0;
1289
1290         /*
1291          * Single buffer I/O?  Skip what we've done so far in this region.
1292          */
1293         if (buffer == NULL) {
1294                 read_count = dev->region.length - dev->n;
1295                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1296                 iov[0].iov_len = read_count;
1297                 iovcount = 1;
1298
1299                 goto config;
1300         }
1301
1302         /*
1303          * Multibuffer I/O.
1304          * Skip empty buffers.
1305          */
1306         while (buffer != NULL) {
1307                 REQUIRE(ISC_BUFFER_VALID(buffer));
1308                 if (isc_buffer_availablelength(buffer) != 0)
1309                         break;
1310                 buffer = ISC_LIST_NEXT(buffer, link);
1311         }
1312
1313         iovcount = 0;
1314         while (buffer != NULL) {
1315                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1316
1317                 isc_buffer_availableregion(buffer, &available);
1318
1319                 if (available.length > 0) {
1320                         iov[iovcount].iov_base = (void *)(available.base);
1321                         iov[iovcount].iov_len = available.length;
1322                         read_count += available.length;
1323                         iovcount++;
1324                 }
1325                 buffer = ISC_LIST_NEXT(buffer, link);
1326         }
1327
1328  config:
1329
1330         /*
1331          * If needed, set up to receive that one extra byte.  Note that
1332          * we know there is at least one iov left, since we stole it
1333          * at the top of this function.
1334          */
1335 #ifdef ISC_NET_RECVOVERFLOW
1336         if (sock->type == isc_sockettype_udp) {
1337                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1338                 iov[iovcount].iov_len = 1;
1339                 iovcount++;
1340         }
1341 #endif
1342
1343         msg->msg_iov = iov;
1344         msg->msg_iovlen = iovcount;
1345
1346 #ifdef ISC_NET_BSD44MSGHDR
1347         msg->msg_control = NULL;
1348         msg->msg_controllen = 0;
1349         msg->msg_flags = 0;
1350 #if defined(USE_CMSG)
1351         if (sock->type == isc_sockettype_udp) {
1352                 msg->msg_control = sock->recvcmsgbuf;
1353                 msg->msg_controllen = sock->recvcmsgbuflen;
1354         }
1355 #endif /* USE_CMSG */
1356 #else /* ISC_NET_BSD44MSGHDR */
1357         msg->msg_accrights = NULL;
1358         msg->msg_accrightslen = 0;
1359 #endif /* ISC_NET_BSD44MSGHDR */
1360
1361         if (read_countp != NULL)
1362                 *read_countp = read_count;
1363 }
1364
1365 static void
1366 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1367                 isc_socketevent_t *dev)
1368 {
1369         if (sock->type == isc_sockettype_udp) {
1370                 if (address != NULL)
1371                         dev->address = *address;
1372                 else
1373                         dev->address = sock->peer_address;
1374         } else if (sock->type == isc_sockettype_tcp) {
1375                 INSIST(address == NULL);
1376                 dev->address = sock->peer_address;
1377         }
1378 }
1379
1380 static void
1381 destroy_socketevent(isc_event_t *event) {
1382         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1383
1384         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1385
1386         (ev->destroy)(event);
1387 }
1388
1389 static isc_socketevent_t *
1390 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1391                      isc_taskaction_t action, const void *arg)
1392 {
1393         isc_socketevent_t *ev;
1394
1395         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1396                                                      sock, eventtype,
1397                                                      action, arg,
1398                                                      sizeof(*ev));
1399
1400         if (ev == NULL)
1401                 return (NULL);
1402
1403         ev->result = ISC_R_UNEXPECTED;
1404         ISC_LINK_INIT(ev, ev_link);
1405         ISC_LIST_INIT(ev->bufferlist);
1406         ev->region.base = NULL;
1407         ev->n = 0;
1408         ev->offset = 0;
1409         ev->attributes = 0;
1410         ev->destroy = ev->ev_destroy;
1411         ev->ev_destroy = destroy_socketevent;
1412
1413         return (ev);
1414 }
1415
1416 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the address/length fields of a msghdr and each of
 * its iovec entries to stdout.  With ISC_NET_BSD44MSGHDR the control
 * buffer pointer and length are printed as well.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        /* %p requires a void * argument, hence the casts. */
        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
1435 #endif
1436
1437 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
1438 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
1439 #define DOIO_HARD               2       /* i/o error, event sent */
1440 #define DOIO_EOF                3       /* EOF, no event sent */
1441
1442 static int
1443 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1444         int cc;
1445         struct iovec iov[MAXSCATTERGATHER_RECV];
1446         size_t read_count;
1447         size_t actual_count;
1448         struct msghdr msghdr;
1449         isc_buffer_t *buffer;
1450         int recv_errno;
1451         char strbuf[ISC_STRERRORSIZE];
1452
1453         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1454
1455 #if defined(ISC_SOCKET_DEBUG)
1456         dump_msg(&msghdr);
1457 #endif
1458
1459         cc = recvmsg(sock->fd, &msghdr, 0);
1460         recv_errno = errno;
1461
1462 #if defined(ISC_SOCKET_DEBUG)
1463         dump_msg(&msghdr);
1464 #endif
1465
1466         if (cc < 0) {
1467                 if (SOFT_ERROR(recv_errno))
1468                         return (DOIO_SOFT);
1469
1470                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1471                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1472                         socket_log(sock, NULL, IOEVENT,
1473                                    isc_msgcat, ISC_MSGSET_SOCKET,
1474                                    ISC_MSG_DOIORECV,
1475                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1476                                    sock->fd, cc, recv_errno, strbuf);
1477                 }
1478
1479 #define SOFT_OR_HARD(_system, _isc) \
1480         if (recv_errno == _system) { \
1481                 if (sock->connected) { \
1482                         dev->result = _isc; \
1483                         inc_stats(sock->manager->stats, \
1484                                   sock->statsindex[STATID_RECVFAIL]); \
1485                         return (DOIO_HARD); \
1486                 } \
1487                 return (DOIO_SOFT); \
1488         }
1489 #define ALWAYS_HARD(_system, _isc) \
1490         if (recv_errno == _system) { \
1491                 dev->result = _isc; \
1492                 inc_stats(sock->manager->stats, \
1493                           sock->statsindex[STATID_RECVFAIL]); \
1494                 return (DOIO_HARD); \
1495         }
1496
1497                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1498                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1499                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1500                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1501                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1502                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1503                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1504                 /*
1505                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1506                  * errors.
1507                  */
1508 #ifdef EPROTO
1509                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1510 #endif
1511                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1512
1513 #undef SOFT_OR_HARD
1514 #undef ALWAYS_HARD
1515
1516                 dev->result = isc__errno2result(recv_errno);
1517                 inc_stats(sock->manager->stats,
1518                           sock->statsindex[STATID_RECVFAIL]);
1519                 return (DOIO_HARD);
1520         }
1521
1522         /*
1523          * On TCP and UNIX sockets, zero length reads indicate EOF,
1524          * while on UDP sockets, zero length reads are perfectly valid,
1525          * although strange.
1526          */
1527         switch (sock->type) {
1528         case isc_sockettype_tcp:
1529         case isc_sockettype_unix:
1530                 if (cc == 0)
1531                         return (DOIO_EOF);
1532                 break;
1533         case isc_sockettype_udp:
1534                 break;
1535         case isc_sockettype_fdwatch:
1536         default:
1537                 INSIST(0);
1538         }
1539
1540         if (sock->type == isc_sockettype_udp) {
1541                 dev->address.length = msghdr.msg_namelen;
1542                 if (isc_sockaddr_getport(&dev->address) == 0) {
1543                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1544                                 socket_log(sock, &dev->address, IOEVENT,
1545                                            isc_msgcat, ISC_MSGSET_SOCKET,
1546                                            ISC_MSG_ZEROPORT,
1547                                            "dropping source port zero packet");
1548                         }
1549                         return (DOIO_SOFT);
1550                 }
1551         }
1552
1553         socket_log(sock, &dev->address, IOEVENT,
1554                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1555                    "packet received correctly");
1556
1557         /*
1558          * Overflow bit detection.  If we received MORE bytes than we should,
1559          * this indicates an overflow situation.  Set the flag in the
1560          * dev entry and adjust how much we read by one.
1561          */
1562 #ifdef ISC_NET_RECVOVERFLOW
1563         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1564                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1565                 cc--;
1566         }
1567 #endif
1568
1569         /*
1570          * If there are control messages attached, run through them and pull
1571          * out the interesting bits.
1572          */
1573         if (sock->type == isc_sockettype_udp)
1574                 process_cmsg(sock, &msghdr, dev);
1575
1576         /*
1577          * update the buffers (if any) and the i/o count
1578          */
1579         dev->n += cc;
1580         actual_count = cc;
1581         buffer = ISC_LIST_HEAD(dev->bufferlist);
1582         while (buffer != NULL && actual_count > 0U) {
1583                 REQUIRE(ISC_BUFFER_VALID(buffer));
1584                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1585                         actual_count -= isc_buffer_availablelength(buffer);
1586                         isc_buffer_add(buffer,
1587                                        isc_buffer_availablelength(buffer));
1588                 } else {
1589                         isc_buffer_add(buffer, actual_count);
1590                         actual_count = 0;
1591                         break;
1592                 }
1593                 buffer = ISC_LIST_NEXT(buffer, link);
1594                 if (buffer == NULL) {
1595                         INSIST(actual_count == 0U);
1596                 }
1597         }
1598
1599         /*
1600          * If we read less than we expected, update counters,
1601          * and let the upper layer poke the descriptor.
1602          */
1603         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1604                 return (DOIO_SOFT);
1605
1606         /*
1607          * Full reads are posted, or partials if partials are ok.
1608          */
1609         dev->result = ISC_R_SUCCESS;
1610         return (DOIO_SUCCESS);
1611 }
1612
1613 /*
1614  * Returns:
1615  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1616  *                      ISC_R_SUCCESS.
1617  *
1618  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1619  *                      dev->result contains the appropriate error.
1620  *
1621  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1622  *                      event was sent.  The operation should be retried.
1623  *
1624  *      No other return values are possible.
1625  */
1626 static int
1627 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1628         int cc;
1629         struct iovec iov[MAXSCATTERGATHER_SEND];
1630         size_t write_count;
1631         struct msghdr msghdr;
1632         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1633         int attempts = 0;
1634         int send_errno;
1635         char strbuf[ISC_STRERRORSIZE];
1636
1637         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1638
1639  resend:
1640         cc = sendmsg(sock->fd, &msghdr, 0);
1641         send_errno = errno;
1642
1643         /*
1644          * Check for error or block condition.
1645          */
1646         if (cc < 0) {
1647                 if (send_errno == EINTR && ++attempts < NRETRIES)
1648                         goto resend;
1649
1650                 if (SOFT_ERROR(send_errno))
1651                         return (DOIO_SOFT);
1652
1653 #define SOFT_OR_HARD(_system, _isc) \
1654         if (send_errno == _system) { \
1655                 if (sock->connected) { \
1656                         dev->result = _isc; \
1657                         inc_stats(sock->manager->stats, \
1658                                   sock->statsindex[STATID_SENDFAIL]); \
1659                         return (DOIO_HARD); \
1660                 } \
1661                 return (DOIO_SOFT); \
1662         }
1663 #define ALWAYS_HARD(_system, _isc) \
1664         if (send_errno == _system) { \
1665                 dev->result = _isc; \
1666                 inc_stats(sock->manager->stats, \
1667                           sock->statsindex[STATID_SENDFAIL]); \
1668                 return (DOIO_HARD); \
1669         }
1670
1671                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1672                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1673                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1674                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1675                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1676 #ifdef EHOSTDOWN
1677                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1678 #endif
1679                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1680                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1681                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1682                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1683                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1684
1685 #undef SOFT_OR_HARD
1686 #undef ALWAYS_HARD
1687
1688                 /*
1689                  * The other error types depend on whether or not the
1690                  * socket is UDP or TCP.  If it is UDP, some errors
1691                  * that we expect to be fatal under TCP are merely
1692                  * annoying, and are really soft errors.
1693                  *
1694                  * However, these soft errors are still returned as
1695                  * a status.
1696                  */
1697                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1698                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1699                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1700                                  addrbuf, strbuf);
1701                 dev->result = isc__errno2result(send_errno);
1702                 inc_stats(sock->manager->stats,
1703                           sock->statsindex[STATID_SENDFAIL]);
1704                 return (DOIO_HARD);
1705         }
1706
1707         if (cc == 0) {
1708                 inc_stats(sock->manager->stats,
1709                           sock->statsindex[STATID_SENDFAIL]);
1710                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1711                                  "doio_send: send() %s 0",
1712                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1713                                                 ISC_MSG_RETURNED, "returned"));
1714         }
1715
1716         /*
1717          * If we write less than we expected, update counters, poke.
1718          */
1719         dev->n += cc;
1720         if ((size_t)cc != write_count)
1721                 return (DOIO_SOFT);
1722
1723         /*
1724          * Exactly what we wanted to write.  We're done with this
1725          * entry.  Post its completion event.
1726          */
1727         dev->result = ISC_R_SUCCESS;
1728         return (DOIO_SUCCESS);
1729 }
1730
1731 /*
1732  * Kill.
1733  *
1734  * Caller must ensure that the socket is not locked and no external
1735  * references exist.
1736  */
1737 static void
1738 closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1739         isc_sockettype_t type = sock->type;
1740         int lockid = FDLOCK_ID(fd);
1741
1742         /*
1743          * No one has this socket open, so the watcher doesn't have to be
1744          * poked, and the socket doesn't have to be locked.
1745          */
1746         LOCK(&manager->fdlock[lockid]);
1747         manager->fds[fd] = NULL;
1748         if (type == isc_sockettype_fdwatch)
1749                 manager->fdstate[fd] = CLOSED;
1750         else
1751                 manager->fdstate[fd] = CLOSE_PENDING;
1752         UNLOCK(&manager->fdlock[lockid]);
1753         if (type == isc_sockettype_fdwatch) {
1754                 /*
1755                  * The caller may close the socket once this function returns,
1756                  * and `fd' may be reassigned for a new socket.  So we do
1757                  * unwatch_fd() here, rather than defer it via select_poke().
1758                  * Note: this may complicate data protection among threads and
1759                  * may reduce performance due to additional locks.  One way to
1760                  * solve this would be to dup() the watched descriptor, but we
1761                  * take a simpler approach at this moment.
1762                  */
1763                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1764                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1765         } else
1766                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1767
1768         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1769
1770         /*
1771          * update manager->maxfd here (XXX: this should be implemented more
1772          * efficiently)
1773          */
1774 #ifdef USE_SELECT
1775         LOCK(&manager->lock);
1776         if (manager->maxfd == fd) {
1777                 int i;
1778
1779                 manager->maxfd = 0;
1780                 for (i = fd - 1; i >= 0; i--) {
1781                         lockid = FDLOCK_ID(i);
1782
1783                         LOCK(&manager->fdlock[lockid]);
1784                         if (manager->fdstate[i] == MANAGED) {
1785                                 manager->maxfd = i;
1786                                 UNLOCK(&manager->fdlock[lockid]);
1787                                 break;
1788                         }
1789                         UNLOCK(&manager->fdlock[lockid]);
1790                 }
1791 #ifdef ISC_PLATFORM_USETHREADS
1792                 if (manager->maxfd < manager->pipe_fds[0])
1793                         manager->maxfd = manager->pipe_fds[0];
1794 #endif
1795         }
1796         UNLOCK(&manager->lock);
1797 #endif  /* USE_SELECT */
1798 }
1799
1800 static void
1801 destroy(isc_socket_t **sockp) {
1802         int fd;
1803         isc_socket_t *sock = *sockp;
1804         isc_socketmgr_t *manager = sock->manager;
1805
1806         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1807                    ISC_MSG_DESTROYING, "destroying");
1808
1809         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1810         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1811         INSIST(ISC_LIST_EMPTY(sock->send_list));
1812         INSIST(sock->connect_ev == NULL);
1813         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1814
1815         if (sock->fd >= 0) {
1816                 fd = sock->fd;
1817                 sock->fd = -1;
1818                 closesocket(manager, sock, fd);
1819         }
1820
1821         LOCK(&manager->lock);
1822
1823         ISC_LIST_UNLINK(manager->socklist, sock, link);
1824
1825 #ifdef ISC_PLATFORM_USETHREADS
1826         if (ISC_LIST_EMPTY(manager->socklist))
1827                 SIGNAL(&manager->shutdown_ok);
1828 #endif /* ISC_PLATFORM_USETHREADS */
1829
1830         UNLOCK(&manager->lock);
1831
1832         free_socket(sockp);
1833 }
1834
1835 static isc_result_t
1836 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1837                 isc_socket_t **socketp)
1838 {
1839         isc_socket_t *sock;
1840         isc_result_t result;
1841         ISC_SOCKADDR_LEN_T cmsgbuflen;
1842
1843         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1844
1845         if (sock == NULL)
1846                 return (ISC_R_NOMEMORY);
1847
1848         result = ISC_R_UNEXPECTED;
1849
1850         sock->magic = 0;
1851         sock->references = 0;
1852
1853         sock->manager = manager;
1854         sock->type = type;
1855         sock->fd = -1;
1856         sock->statsindex = NULL;
1857
1858         ISC_LINK_INIT(sock, link);
1859
1860         sock->recvcmsgbuf = NULL;
1861         sock->sendcmsgbuf = NULL;
1862
1863         /*
1864          * set up cmsg buffers
1865          */
1866         cmsgbuflen = 0;
1867 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1868         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1869 #endif
1870 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1871         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1872 #endif
1873         sock->recvcmsgbuflen = cmsgbuflen;
1874         if (sock->recvcmsgbuflen != 0U) {
1875                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1876                 if (sock->recvcmsgbuf == NULL)
1877                         goto error;
1878         }
1879
1880         cmsgbuflen = 0;
1881 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1882         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1883 #endif
1884         sock->sendcmsgbuflen = cmsgbuflen;
1885         if (sock->sendcmsgbuflen != 0U) {
1886                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1887                 if (sock->sendcmsgbuf == NULL)
1888                         goto error;
1889         }
1890
1891         memset(sock->name, 0, sizeof(sock->name));
1892         sock->tag = NULL;
1893
1894         /*
1895          * set up list of readers and writers to be initially empty
1896          */
1897         ISC_LIST_INIT(sock->recv_list);
1898         ISC_LIST_INIT(sock->send_list);
1899         ISC_LIST_INIT(sock->accept_list);
1900         sock->connect_ev = NULL;
1901         sock->pending_recv = 0;
1902         sock->pending_send = 0;
1903         sock->pending_accept = 0;
1904         sock->listener = 0;
1905         sock->connected = 0;
1906         sock->connecting = 0;
1907         sock->bound = 0;
1908
1909         /*
1910          * initialize the lock
1911          */
1912         result = isc_mutex_init(&sock->lock);
1913         if (result != ISC_R_SUCCESS) {
1914                 sock->magic = 0;
1915                 goto error;
1916         }
1917
1918         /*
1919          * Initialize readable and writable events
1920          */
1921         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1922                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1923                        NULL, sock, sock, NULL, NULL);
1924         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1925                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1926                        NULL, sock, sock, NULL, NULL);
1927
1928         sock->magic = SOCKET_MAGIC;
1929         *socketp = sock;
1930
1931         return (ISC_R_SUCCESS);
1932
1933  error:
1934         if (sock->recvcmsgbuf != NULL)
1935                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1936                             sock->recvcmsgbuflen);
1937         if (sock->sendcmsgbuf != NULL)
1938                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1939                             sock->sendcmsgbuflen);
1940         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1941
1942         return (result);
1943 }
1944
1945 /*
1946  * This event requires that the various lists be empty, that the reference
1947  * count be 1, and that the magic number is valid.  The other socket bits,
1948  * like the lock, must be initialized as well.  The fd associated must be
1949  * marked as closed, by setting it to -1 on close, or this routine will
1950  * also close the socket.
1951  */
1952 static void
1953 free_socket(isc_socket_t **socketp) {
1954         isc_socket_t *sock = *socketp;
1955
1956         INSIST(sock->references == 0);
1957         INSIST(VALID_SOCKET(sock));
1958         INSIST(!sock->connecting);
1959         INSIST(!sock->pending_recv);
1960         INSIST(!sock->pending_send);
1961         INSIST(!sock->pending_accept);
1962         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1963         INSIST(ISC_LIST_EMPTY(sock->send_list));
1964         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1965         INSIST(!ISC_LINK_LINKED(sock, link));
1966
1967         if (sock->recvcmsgbuf != NULL)
1968                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1969                             sock->recvcmsgbuflen);
1970         if (sock->sendcmsgbuf != NULL)
1971                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1972                             sock->sendcmsgbuflen);
1973
1974         sock->magic = 0;
1975
1976         DESTROYLOCK(&sock->lock);
1977
1978         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1979
1980         *socketp = NULL;
1981 }
1982
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, clear 'bsdcompat' when the running kernel reports release
 * 2.4 or later.  On other platforms this is a no-op.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
         struct utsname uts;
         char *cursor;
         long int major;
         long int minor;

         uname(&uts);    /* Can only fail if the buffer is bad in Linux. */

         /* Paranoia in parsing can be increased, but we trust uname(). */
         major = strtol(uts.release, &cursor, 10);
         if (*cursor != '.')
                return;

         minor = strtol(cursor + 1, &cursor, 10);
         if (major > 2 || (major == 2 && minor >= 4))
                bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif /* SO_BSDCOMPAT */
2020
2021 static isc_result_t
2022 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
2023         char strbuf[ISC_STRERRORSIZE];
2024         const char *err = "socket";
2025         int tries = 0;
2026 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2027         int on = 1;
2028 #endif
2029 #if defined(SO_RCVBUF)
2030         ISC_SOCKADDR_LEN_T optlen;
2031         int size;
2032 #endif
2033
2034  again:
2035         switch (sock->type) {
2036         case isc_sockettype_udp:
2037                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2038                 break;
2039         case isc_sockettype_tcp:
2040                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2041                 break;
2042         case isc_sockettype_unix:
2043                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2044                 break;
2045         case isc_sockettype_fdwatch:
2046                 /*
2047                  * We should not be called for isc_sockettype_fdwatch sockets.
2048                  */
2049                 INSIST(0);
2050                 break;
2051         }
2052         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2053                 goto again;
2054
2055 #ifdef F_DUPFD
2056         /*
2057          * Leave a space for stdio and TCP to work in.
2058          */
2059         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2060             sock->fd >= 0 && sock->fd < manager->reserved) {
2061                 int new, tmp;
2062                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2063                 tmp = errno;
2064                 (void)close(sock->fd);
2065                 errno = tmp;
2066                 sock->fd = new;
2067                 err = "isc_socket_create: fcntl/reserved";
2068         } else if (sock->fd >= 0 && sock->fd < 20) {
2069                 int new, tmp;
2070                 new = fcntl(sock->fd, F_DUPFD, 20);
2071                 tmp = errno;
2072                 (void)close(sock->fd);
2073                 errno = tmp;
2074                 sock->fd = new;
2075                 err = "isc_socket_create: fcntl";
2076         }
2077 #endif
2078
2079         if (sock->fd >= (int)manager->maxsocks) {
2080                 (void)close(sock->fd);
2081                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2082                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2083                                isc_msgcat, ISC_MSGSET_SOCKET,
2084                                ISC_MSG_TOOMANYFDS,
2085                                "socket: file descriptor exceeds limit (%d/%u)",
2086                                sock->fd, manager->maxsocks);
2087                 return (ISC_R_NORESOURCES);
2088         }
2089
2090         if (sock->fd < 0) {
2091                 switch (errno) {
2092                 case EMFILE:
2093                 case ENFILE:
2094                         isc__strerror(errno, strbuf, sizeof(strbuf));
2095                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2096                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2097                                        isc_msgcat, ISC_MSGSET_SOCKET,
2098                                        ISC_MSG_TOOMANYFDS,
2099                                        "%s: %s", err, strbuf);
2100                         /* fallthrough */
2101                 case ENOBUFS:
2102                         return (ISC_R_NORESOURCES);
2103
2104                 case EPROTONOSUPPORT:
2105                 case EPFNOSUPPORT:
2106                 case EAFNOSUPPORT:
2107                 /*
2108                  * Linux 2.2 (and maybe others) return EINVAL instead of
2109                  * EAFNOSUPPORT.
2110                  */
2111                 case EINVAL:
2112                         return (ISC_R_FAMILYNOSUPPORT);
2113
2114                 default:
2115                         isc__strerror(errno, strbuf, sizeof(strbuf));
2116                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2117                                          "%s() %s: %s", err,
2118                                          isc_msgcat_get(isc_msgcat,
2119                                                         ISC_MSGSET_GENERAL,
2120                                                         ISC_MSG_FAILED,
2121                                                         "failed"),
2122                                          strbuf);
2123                         return (ISC_R_UNEXPECTED);
2124                 }
2125         }
2126
2127         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
2128                 (void)close(sock->fd);
2129                 return (ISC_R_UNEXPECTED);
2130         }
2131
2132 #ifdef SO_BSDCOMPAT
2133         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2134                                   clear_bsdcompat) == ISC_R_SUCCESS);
2135         if (sock->type != isc_sockettype_unix && bsdcompat &&
2136             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2137                        (void *)&on, sizeof(on)) < 0) {
2138                 isc__strerror(errno, strbuf, sizeof(strbuf));
2139                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2140                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2141                                  sock->fd,
2142                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2143                                                 ISC_MSG_FAILED, "failed"),
2144                                  strbuf);
2145                 /* Press on... */
2146         }
2147 #endif
2148
2149 #ifdef SO_NOSIGPIPE
2150         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2151                        (void *)&on, sizeof(on)) < 0) {
2152                 isc__strerror(errno, strbuf, sizeof(strbuf));
2153                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2154                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2155                                  sock->fd,
2156                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2157                                                 ISC_MSG_FAILED, "failed"),
2158                                  strbuf);
2159                 /* Press on... */
2160         }
2161 #endif
2162
2163 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2164         if (sock->type == isc_sockettype_udp) {
2165
2166 #if defined(USE_CMSG)
2167 #if defined(SO_TIMESTAMP)
2168                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2169                                (void *)&on, sizeof(on)) < 0
2170                     && errno != ENOPROTOOPT) {
2171                         isc__strerror(errno, strbuf, sizeof(strbuf));
2172                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2173                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2174                                          sock->fd,
2175                                          isc_msgcat_get(isc_msgcat,
2176                                                         ISC_MSGSET_GENERAL,
2177                                                         ISC_MSG_FAILED,
2178                                                         "failed"),
2179                                          strbuf);
2180                         /* Press on... */
2181                 }
2182 #endif /* SO_TIMESTAMP */
2183
2184 #if defined(ISC_PLATFORM_HAVEIPV6)
2185                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2186                         /*
2187                          * Warn explicitly because this anomaly can be hidden
2188                          * in usual operation (and unexpectedly appear later).
2189                          */
2190                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2191                                          "No buffer available to receive "
2192                                          "IPv6 destination");
2193                 }
2194 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2195 #ifdef IPV6_RECVPKTINFO
2196                 /* RFC 3542 */
2197                 if ((sock->pf == AF_INET6)
2198                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2199                                    (void *)&on, sizeof(on)) < 0)) {
2200                         isc__strerror(errno, strbuf, sizeof(strbuf));
2201                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2202                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2203                                          "%s: %s", sock->fd,
2204                                          isc_msgcat_get(isc_msgcat,
2205                                                         ISC_MSGSET_GENERAL,
2206                                                         ISC_MSG_FAILED,
2207                                                         "failed"),
2208                                          strbuf);
2209                 }
2210 #else
2211                 /* RFC 2292 */
2212                 if ((sock->pf == AF_INET6)
2213                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2214                                    (void *)&on, sizeof(on)) < 0)) {
2215                         isc__strerror(errno, strbuf, sizeof(strbuf));
2216                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2217                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2218                                          sock->fd,
2219                                          isc_msgcat_get(isc_msgcat,
2220                                                         ISC_MSGSET_GENERAL,
2221                                                         ISC_MSG_FAILED,
2222                                                         "failed"),
2223                                          strbuf);
2224                 }
2225 #endif /* IPV6_RECVPKTINFO */
2226 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2227 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2228                 /* use minimum MTU */
2229                 if (sock->pf == AF_INET6) {
2230                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2231                                          IPV6_USE_MIN_MTU,
2232                                          (void *)&on, sizeof(on));
2233                 }
2234 #endif
2235 #endif /* ISC_PLATFORM_HAVEIPV6 */
2236 #endif /* defined(USE_CMSG) */
2237
2238 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2239                 /*
2240                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2241                  */
2242                 if (sock->pf == AF_INET) {
2243                         int action = IP_PMTUDISC_DONT;
2244                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2245                                          &action, sizeof(action));
2246                 }
2247 #endif
2248 #if defined(IP_DONTFRAG)
2249                 /*
2250                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2251                  */
2252                 if (sock->pf == AF_INET) {
2253                         int off = 0;
2254                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2255                                          &off, sizeof(off));
2256                 }
2257 #endif
2258
2259 #if defined(SO_RCVBUF)
2260                 optlen = sizeof(size);
2261                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2262                                (void *)&size, &optlen) >= 0 &&
2263                      size < RCVBUFSIZE) {
2264                         size = RCVBUFSIZE;
2265                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2266                                        (void *)&size, sizeof(size)) == -1) {
2267                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2268                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2269                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2270                                         sock->fd, size,
2271                                         isc_msgcat_get(isc_msgcat,
2272                                                        ISC_MSGSET_GENERAL,
2273                                                        ISC_MSG_FAILED,
2274                                                        "failed"),
2275                                         strbuf);
2276                         }
2277                 }
2278 #endif
2279         }
2280 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2281
2282         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2283
2284         return (ISC_R_SUCCESS);
2285 }
2286
2287 /*%
2288  * Create a new 'type' socket managed by 'manager'.  Events
2289  * will be posted to 'task' and when dispatched 'action' will be
2290  * called with 'arg' as the arg value.  The new socket is returned
2291  * in 'socketp'.
2292  */
2293 isc_result_t
2294 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2295                   isc_socket_t **socketp)
2296 {
2297         isc_socket_t *sock = NULL;
2298         isc_result_t result;
2299         int lockid;
2300
2301         REQUIRE(VALID_MANAGER(manager));
2302         REQUIRE(socketp != NULL && *socketp == NULL);
2303         REQUIRE(type != isc_sockettype_fdwatch);
2304
2305         result = allocate_socket(manager, type, &sock);
2306         if (result != ISC_R_SUCCESS)
2307                 return (result);
2308
2309         switch (sock->type) {
2310         case isc_sockettype_udp:
2311                 sock->statsindex =
2312                         (pf == AF_INET) ? upd4statsindex : upd6statsindex;
2313                 break;
2314         case isc_sockettype_tcp:
2315                 sock->statsindex =
2316                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2317                 break;
2318         case isc_sockettype_unix:
2319                 sock->statsindex = unixstatsindex;
2320                 break;
2321         default:
2322                 INSIST(0);
2323         }
2324
2325         sock->pf = pf;
2326         result = opensocket(manager, sock);
2327         if (result != ISC_R_SUCCESS) {
2328                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2329                 free_socket(&sock);
2330                 return (result);
2331         }
2332
2333         sock->references = 1;
2334         *socketp = sock;
2335
2336         /*
2337          * Note we don't have to lock the socket like we normally would because
2338          * there are no external references to it yet.
2339          */
2340
2341         lockid = FDLOCK_ID(sock->fd);
2342         LOCK(&manager->fdlock[lockid]);
2343         manager->fds[sock->fd] = sock;
2344         manager->fdstate[sock->fd] = MANAGED;
2345 #ifdef USE_DEVPOLL
2346         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2347                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2348 #endif
2349         UNLOCK(&manager->fdlock[lockid]);
2350
2351         LOCK(&manager->lock);
2352         ISC_LIST_APPEND(manager->socklist, sock, link);
2353 #ifdef USE_SELECT
2354         if (manager->maxfd < sock->fd)
2355                 manager->maxfd = sock->fd;
2356 #endif
2357         UNLOCK(&manager->lock);
2358
2359         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2360                    ISC_MSG_CREATED, "created");
2361
2362         return (ISC_R_SUCCESS);
2363 }
2364
2365 isc_result_t
2366 isc_socket_open(isc_socket_t *sock) {
2367         isc_result_t result;
2368
2369         REQUIRE(VALID_SOCKET(sock));
2370
2371         LOCK(&sock->lock);
2372         REQUIRE(sock->references == 1);
2373         REQUIRE(sock->type != isc_sockettype_fdwatch);
2374         UNLOCK(&sock->lock);
2375         /*
2376          * We don't need to retain the lock hereafter, since no one else has
2377          * this socket.
2378          */
2379         REQUIRE(sock->fd == -1);
2380
2381         result = opensocket(sock->manager, sock);
2382         if (result != ISC_R_SUCCESS)
2383                 sock->fd = -1;
2384
2385         if (result == ISC_R_SUCCESS) {
2386                 int lockid = FDLOCK_ID(sock->fd);
2387
2388                 LOCK(&sock->manager->fdlock[lockid]);
2389                 sock->manager->fds[sock->fd] = sock;
2390                 sock->manager->fdstate[sock->fd] = MANAGED;
2391 #ifdef USE_DEVPOLL
2392                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2393                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2394 #endif
2395                 UNLOCK(&sock->manager->fdlock[lockid]);
2396
2397 #ifdef USE_SELECT
2398                 LOCK(&sock->manager->lock);
2399                 if (sock->manager->maxfd < sock->fd)
2400                         sock->manager->maxfd = sock->fd;
2401                 UNLOCK(&sock->manager->lock);
2402 #endif
2403         }
2404
2405         return (result);
2406 }
2407
2408 /*
2409  * Create a new 'type' socket managed by 'manager'.  Events
2410  * will be posted to 'task' and when dispatched 'action' will be
2411  * called with 'arg' as the arg value.  The new socket is returned
2412  * in 'socketp'.
2413  */
2414 isc_result_t
2415 isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
2416                          isc_sockfdwatch_t callback, void *cbarg,
2417                          isc_task_t *task, isc_socket_t **socketp)
2418 {
2419         isc_socket_t *sock = NULL;
2420         isc_result_t result;
2421         int lockid;
2422
2423         REQUIRE(VALID_MANAGER(manager));
2424         REQUIRE(socketp != NULL && *socketp == NULL);
2425
2426         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2427         if (result != ISC_R_SUCCESS)
2428                 return (result);
2429
2430         sock->fd = fd;
2431         sock->fdwatcharg = cbarg;
2432         sock->fdwatchcb = callback;
2433         sock->fdwatchflags = flags;
2434         sock->fdwatchtask = task;
2435         sock->statsindex = fdwatchstatsindex;
2436
2437         sock->references = 1;
2438         *socketp = sock;
2439
2440         /*
2441          * Note we don't have to lock the socket like we normally would because
2442          * there are no external references to it yet.
2443          */
2444
2445         lockid = FDLOCK_ID(sock->fd);
2446         LOCK(&manager->fdlock[lockid]);
2447         manager->fds[sock->fd] = sock;
2448         manager->fdstate[sock->fd] = MANAGED;
2449         UNLOCK(&manager->fdlock[lockid]);
2450
2451         LOCK(&manager->lock);
2452         ISC_LIST_APPEND(manager->socklist, sock, link);
2453 #ifdef USE_SELECT
2454         if (manager->maxfd < sock->fd)
2455                 manager->maxfd = sock->fd;
2456 #endif
2457         UNLOCK(&manager->lock);
2458
2459         if (flags & ISC_SOCKFDWATCH_READ)
2460                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2461         if (flags & ISC_SOCKFDWATCH_WRITE)
2462                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2463
2464         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2465                    ISC_MSG_CREATED, "fdwatch-created");
2466
2467         return (ISC_R_SUCCESS);
2468 }
2469
2470 /*
2471  * Attach to a socket.  Caller must explicitly detach when it is done.
2472  */
2473 void
2474 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2475         REQUIRE(VALID_SOCKET(sock));
2476         REQUIRE(socketp != NULL && *socketp == NULL);
2477
2478         LOCK(&sock->lock);
2479         sock->references++;
2480         UNLOCK(&sock->lock);
2481
2482         *socketp = sock;
2483 }
2484
2485 /*
2486  * Dereference a socket.  If this is the last reference to it, clean things
2487  * up by destroying the socket.
2488  */
2489 void
2490 isc_socket_detach(isc_socket_t **socketp) {
2491         isc_socket_t *sock;
2492         isc_boolean_t kill_socket = ISC_FALSE;
2493
2494         REQUIRE(socketp != NULL);
2495         sock = *socketp;
2496         REQUIRE(VALID_SOCKET(sock));
2497
2498         LOCK(&sock->lock);
2499         REQUIRE(sock->references > 0);
2500         sock->references--;
2501         if (sock->references == 0)
2502                 kill_socket = ISC_TRUE;
2503         UNLOCK(&sock->lock);
2504
2505         if (kill_socket)
2506                 destroy(&sock);
2507
2508         *socketp = NULL;
2509 }
2510
2511 isc_result_t
2512 isc_socket_close(isc_socket_t *sock) {
2513         int fd;
2514         isc_socketmgr_t *manager;
2515         isc_sockettype_t type;
2516
2517         REQUIRE(VALID_SOCKET(sock));
2518
2519         LOCK(&sock->lock);
2520
2521         REQUIRE(sock->references == 1);
2522         REQUIRE(sock->type != isc_sockettype_fdwatch);
2523         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2524
2525         INSIST(!sock->connecting);
2526         INSIST(!sock->pending_recv);
2527         INSIST(!sock->pending_send);
2528         INSIST(!sock->pending_accept);
2529         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2530         INSIST(ISC_LIST_EMPTY(sock->send_list));
2531         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2532         INSIST(sock->connect_ev == NULL);
2533
2534         manager = sock->manager;
2535         type = sock->type;
2536         fd = sock->fd;
2537         sock->fd = -1;
2538         memset(sock->name, 0, sizeof(sock->name));
2539         sock->tag = NULL;
2540         sock->listener = 0;
2541         sock->connected = 0;
2542         sock->connecting = 0;
2543         sock->bound = 0;
2544         isc_sockaddr_any(&sock->peer_address);
2545
2546         UNLOCK(&sock->lock);
2547
2548         closesocket(manager, sock, fd);
2549
2550         return (ISC_R_SUCCESS);
2551 }
2552
2553 /*
2554  * I/O is possible on a given socket.  Schedule an event to this task that
2555  * will call an internal function to do the I/O.  This will charge the
2556  * task with the I/O operation and let our select loop handler get back
2557  * to doing something real as fast as possible.
2558  *
2559  * The socket and manager must be locked before calling this function.
2560  */
2561 static void
2562 dispatch_recv(isc_socket_t *sock) {
2563         intev_t *iev;
2564         isc_socketevent_t *ev;
2565         isc_task_t *sender;
2566
2567         INSIST(!sock->pending_recv);
2568
2569         if (sock->type != isc_sockettype_fdwatch) {
2570                 ev = ISC_LIST_HEAD(sock->recv_list);
2571                 if (ev == NULL)
2572                         return;
2573                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2574                            "dispatch_recv:  event %p -> task %p",
2575                            ev, ev->ev_sender);
2576                 sender = ev->ev_sender;
2577         } else {
2578                 sender = sock->fdwatchtask;
2579         }
2580
2581         sock->pending_recv = 1;
2582         iev = &sock->readable_ev;
2583
2584         sock->references++;
2585         iev->ev_sender = sock;
2586         if (sock->type == isc_sockettype_fdwatch)
2587                 iev->ev_action = internal_fdwatch_read;
2588         else
2589                 iev->ev_action = internal_recv;
2590         iev->ev_arg = sock;
2591
2592         isc_task_send(sender, (isc_event_t **)&iev);
2593 }
2594
2595 static void
2596 dispatch_send(isc_socket_t *sock) {
2597         intev_t *iev;
2598         isc_socketevent_t *ev;
2599         isc_task_t *sender;
2600
2601         INSIST(!sock->pending_send);
2602
2603         if (sock->type != isc_sockettype_fdwatch) {
2604                 ev = ISC_LIST_HEAD(sock->send_list);
2605                 if (ev == NULL)
2606                         return;
2607                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2608                            "dispatch_send:  event %p -> task %p",
2609                            ev, ev->ev_sender);
2610                 sender = ev->ev_sender;
2611         } else {
2612                 sender = sock->fdwatchtask;
2613         }
2614
2615         sock->pending_send = 1;
2616         iev = &sock->writable_ev;
2617
2618         sock->references++;
2619         iev->ev_sender = sock;
2620         if (sock->type == isc_sockettype_fdwatch)
2621                 iev->ev_action = internal_fdwatch_write;
2622         else
2623                 iev->ev_action = internal_send;
2624         iev->ev_arg = sock;
2625
2626         isc_task_send(sender, (isc_event_t **)&iev);
2627 }
2628
2629 /*
2630  * Dispatch an internal accept event.
2631  */
2632 static void
2633 dispatch_accept(isc_socket_t *sock) {
2634         intev_t *iev;
2635         isc_socket_newconnev_t *ev;
2636
2637         INSIST(!sock->pending_accept);
2638
2639         /*
2640          * Are there any done events left, or were they all canceled
2641          * before the manager got the socket lock?
2642          */
2643         ev = ISC_LIST_HEAD(sock->accept_list);
2644         if (ev == NULL)
2645                 return;
2646
2647         sock->pending_accept = 1;
2648         iev = &sock->readable_ev;
2649
2650         sock->references++;  /* keep socket around for this internal event */
2651         iev->ev_sender = sock;
2652         iev->ev_action = internal_accept;
2653         iev->ev_arg = sock;
2654
2655         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2656 }
2657
2658 static void
2659 dispatch_connect(isc_socket_t *sock) {
2660         intev_t *iev;
2661         isc_socket_connev_t *ev;
2662
2663         iev = &sock->writable_ev;
2664
2665         ev = sock->connect_ev;
2666         INSIST(ev != NULL); /* XXX */
2667
2668         INSIST(sock->connecting);
2669
2670         sock->references++;  /* keep socket around for this internal event */
2671         iev->ev_sender = sock;
2672         iev->ev_action = internal_connect;
2673         iev->ev_arg = sock;
2674
2675         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2676 }
2677
2678 /*
2679  * Dequeue an item off the given socket's read queue, set the result code
2680  * in the done event to the one provided, and send it to the task it was
2681  * destined for.
2682  *
2683  * If the event to be sent is on a list, remove it before sending.  If
2684  * asked to, send and detach from the socket as well.
2685  *
2686  * Caller must have the socket locked if the event is attached to the socket.
2687  */
2688 static void
2689 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2690         isc_task_t *task;
2691
2692         task = (*dev)->ev_sender;
2693
2694         (*dev)->ev_sender = sock;
2695
2696         if (ISC_LINK_LINKED(*dev, ev_link))
2697                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2698
2699         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2700             == ISC_SOCKEVENTATTR_ATTACHED)
2701                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2702         else
2703                 isc_task_send(task, (isc_event_t **)dev);
2704 }
2705
2706 /*
2707  * See comments for send_recvdone_event() above.
2708  *
2709  * Caller must have the socket locked if the event is attached to the socket.
2710  */
2711 static void
2712 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2713         isc_task_t *task;
2714
2715         INSIST(dev != NULL && *dev != NULL);
2716
2717         task = (*dev)->ev_sender;
2718         (*dev)->ev_sender = sock;
2719
2720         if (ISC_LINK_LINKED(*dev, ev_link))
2721                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2722
2723         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2724             == ISC_SOCKEVENTATTR_ATTACHED)
2725                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2726         else
2727                 isc_task_send(task, (isc_event_t **)dev);
2728 }
2729
2730 /*
2731  * Call accept() on a socket, to get the new file descriptor.  The listen
2732  * socket is used as a prototype to create a new isc_socket_t.  The new
2733  * socket has one outstanding reference.  The task receiving the event
2734  * will be detached from just after the event is delivered.
2735  *
2736  * On entry to this function, the event delivered is the internal
2737  * readable event, and the first item on the accept_list should be
2738  * the done event we want to send.  If the list is empty, this is a no-op,
2739  * so just unlock and return.
2740  */
2741 static void
2742 internal_accept(isc_task_t *me, isc_event_t *ev) {
2743         isc_socket_t *sock;
2744         isc_socketmgr_t *manager;
2745         isc_socket_newconnev_t *dev;
2746         isc_task_t *task;
2747         ISC_SOCKADDR_LEN_T addrlen;
2748         int fd;
2749         isc_result_t result = ISC_R_SUCCESS;
2750         char strbuf[ISC_STRERRORSIZE];
2751         const char *err = "accept";
2752
2753         UNUSED(me);
2754
2755         sock = ev->ev_sender;
2756         INSIST(VALID_SOCKET(sock));
2757
2758         LOCK(&sock->lock);
2759         socket_log(sock, NULL, TRACE,
2760                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2761                    "internal_accept called, locked socket");
2762
2763         manager = sock->manager;
2764         INSIST(VALID_MANAGER(manager));
2765
2766         INSIST(sock->listener);
2767         INSIST(sock->pending_accept == 1);
2768         sock->pending_accept = 0;
2769
2770         INSIST(sock->references > 0);
2771         sock->references--;  /* the internal event is done with this socket */
2772         if (sock->references == 0) {
2773                 UNLOCK(&sock->lock);
2774                 destroy(&sock);
2775                 return;
2776         }
2777
2778         /*
2779          * Get the first item off the accept list.
2780          * If it is empty, unlock the socket and return.
2781          */
2782         dev = ISC_LIST_HEAD(sock->accept_list);
2783         if (dev == NULL) {
2784                 UNLOCK(&sock->lock);
2785                 return;
2786         }
2787
2788         /*
2789          * Try to accept the new connection.  If the accept fails with
2790          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2791          * again.  Also ignore ECONNRESET, which has been reported to
2792          * be spuriously returned on Linux 2.2.19 although it is not
2793          * a documented error for accept().  ECONNABORTED has been
2794          * reported for Solaris 8.  The rest are thrown in not because
2795          * we have seen them but because they are ignored by other
2796          * daemons such as BIND 8 and Apache.
2797          */
2798
2799         addrlen = sizeof(dev->newsocket->peer_address.type);
2800         memset(&dev->newsocket->peer_address.type, 0, addrlen);
2801         fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa,
2802                     (void *)&addrlen);
2803
2804 #ifdef F_DUPFD
2805         /*
2806          * Leave a space for stdio to work in.
2807          */
2808         if (fd >= 0 && fd < 20) {
2809                 int new, tmp;
2810                 new = fcntl(fd, F_DUPFD, 20);
2811                 tmp = errno;
2812                 (void)close(fd);
2813                 errno = tmp;
2814                 fd = new;
2815                 err = "accept/fcntl";
2816         }
2817 #endif
2818
2819         if (fd < 0) {
2820                 if (SOFT_ERROR(errno))
2821                         goto soft_error;
2822                 switch (errno) {
2823                 case ENFILE:
2824                 case EMFILE:
2825                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2826                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2827                                        isc_msgcat, ISC_MSGSET_SOCKET,
2828                                        ISC_MSG_TOOMANYFDS,
2829                                        "%s: too many open file descriptors",
2830                                        err);
2831                         goto soft_error;
2832
2833                 case ENOBUFS:
2834                 case ENOMEM:
2835                 case ECONNRESET:
2836                 case ECONNABORTED:
2837                 case EHOSTUNREACH:
2838                 case EHOSTDOWN:
2839                 case ENETUNREACH:
2840                 case ENETDOWN:
2841                 case ECONNREFUSED:
2842 #ifdef EPROTO
2843                 case EPROTO:
2844 #endif
2845 #ifdef ENONET
2846                 case ENONET:
2847 #endif
2848                         goto soft_error;
2849                 default:
2850                         break;
2851                 }
2852                 isc__strerror(errno, strbuf, sizeof(strbuf));
2853                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2854                                  "internal_accept: %s() %s: %s", err,
2855                                  isc_msgcat_get(isc_msgcat,
2856                                                 ISC_MSGSET_GENERAL,
2857                                                 ISC_MSG_FAILED,
2858                                                 "failed"),
2859                                  strbuf);
2860                 fd = -1;
2861                 result = ISC_R_UNEXPECTED;
2862         } else {
2863                 if (addrlen == 0U) {
2864                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2865                                          "internal_accept(): "
2866                                          "accept() failed to return "
2867                                          "remote address");
2868
2869                         (void)close(fd);
2870                         goto soft_error;
2871                 } else if (dev->newsocket->peer_address.type.sa.sa_family !=
2872                            sock->pf)
2873                 {
2874                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2875                                          "internal_accept(): "
2876                                          "accept() returned peer address "
2877                                          "family %u (expected %u)",
2878                                          dev->newsocket->peer_address.
2879                                          type.sa.sa_family,
2880                                          sock->pf);
2881                         (void)close(fd);
2882                         goto soft_error;
2883                 } else if (fd >= (int)manager->maxsocks) {
2884                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2885                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2886                                        isc_msgcat, ISC_MSGSET_SOCKET,
2887                                        ISC_MSG_TOOMANYFDS,
2888                                        "accept: "
2889                                        "file descriptor exceeds limit (%d/%u)",
2890                                        fd, manager->maxsocks);
2891                         (void)close(fd);
2892                         goto soft_error;
2893                 }
2894         }
2895
2896         if (fd != -1) {
2897                 dev->newsocket->peer_address.length = addrlen;
2898                 dev->newsocket->pf = sock->pf;
2899         }
2900
2901         /*
2902          * Pull off the done event.
2903          */
2904         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2905
2906         /*
2907          * Poke watcher if there are more pending accepts.
2908          */
2909         if (!ISC_LIST_EMPTY(sock->accept_list))
2910                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2911
2912         UNLOCK(&sock->lock);
2913
2914         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2915                 (void)close(fd);
2916                 fd = -1;
2917                 result = ISC_R_UNEXPECTED;
2918         }
2919
2920         /*
2921          * -1 means the new socket didn't happen.
2922          */
2923         if (fd != -1) {
2924                 int lockid = FDLOCK_ID(fd);
2925
2926                 LOCK(&manager->fdlock[lockid]);
2927                 manager->fds[fd] = dev->newsocket;
2928                 manager->fdstate[fd] = MANAGED;
2929                 UNLOCK(&manager->fdlock[lockid]);
2930
2931                 LOCK(&manager->lock);
2932                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2933
2934                 dev->newsocket->fd = fd;
2935                 dev->newsocket->bound = 1;
2936                 dev->newsocket->connected = 1;
2937
2938                 /*
2939                  * Save away the remote address
2940                  */
2941                 dev->address = dev->newsocket->peer_address;
2942
2943 #ifdef USE_SELECT
2944                 if (manager->maxfd < fd)
2945                         manager->maxfd = fd;
2946 #endif
2947
2948                 socket_log(sock, &dev->newsocket->peer_address, CREATION,
2949                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2950                            "accepted connection, new socket %p",
2951                            dev->newsocket);
2952
2953                 UNLOCK(&manager->lock);
2954
2955                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
2956         } else {
2957                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2958                 dev->newsocket->references--;
2959                 free_socket(&dev->newsocket);
2960         }
2961
2962         /*
2963          * Fill in the done event details and send it off.
2964          */
2965         dev->result = result;
2966         task = dev->ev_sender;
2967         dev->ev_sender = sock;
2968
2969         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2970         return;
2971
2972  soft_error:
2973         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2974         UNLOCK(&sock->lock);
2975
2976         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2977         return;
2978 }
2979
2980 static void
2981 internal_recv(isc_task_t *me, isc_event_t *ev) {
2982         isc_socketevent_t *dev;
2983         isc_socket_t *sock;
2984
2985         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2986
2987         sock = ev->ev_sender;
2988         INSIST(VALID_SOCKET(sock));
2989
2990         LOCK(&sock->lock);
2991         socket_log(sock, NULL, IOEVENT,
2992                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2993                    "internal_recv: task %p got event %p", me, ev);
2994
2995         INSIST(sock->pending_recv == 1);
2996         sock->pending_recv = 0;
2997
2998         INSIST(sock->references > 0);
2999         sock->references--;  /* the internal event is done with this socket */
3000         if (sock->references == 0) {
3001                 UNLOCK(&sock->lock);
3002                 destroy(&sock);
3003                 return;
3004         }
3005
3006         /*
3007          * Try to do as much I/O as possible on this socket.  There are no
3008          * limits here, currently.
3009          */
3010         dev = ISC_LIST_HEAD(sock->recv_list);
3011         while (dev != NULL) {
3012                 switch (doio_recv(sock, dev)) {
3013                 case DOIO_SOFT:
3014                         goto poke;
3015
3016                 case DOIO_EOF:
3017                         /*
3018                          * read of 0 means the remote end was closed.
3019                          * Run through the event queue and dispatch all
3020                          * the events with an EOF result code.
3021                          */
3022                         do {
3023                                 dev->result = ISC_R_EOF;
3024                                 send_recvdone_event(sock, &dev);
3025                                 dev = ISC_LIST_HEAD(sock->recv_list);
3026                         } while (dev != NULL);
3027                         goto poke;
3028
3029                 case DOIO_SUCCESS:
3030                 case DOIO_HARD:
3031                         send_recvdone_event(sock, &dev);
3032                         break;
3033                 }
3034
3035                 dev = ISC_LIST_HEAD(sock->recv_list);
3036         }
3037
3038  poke:
3039         if (!ISC_LIST_EMPTY(sock->recv_list))
3040                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3041
3042         UNLOCK(&sock->lock);
3043 }
3044
3045 static void
3046 internal_send(isc_task_t *me, isc_event_t *ev) {
3047         isc_socketevent_t *dev;
3048         isc_socket_t *sock;
3049
3050         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3051
3052         /*
3053          * Find out what socket this is and lock it.
3054          */
3055         sock = (isc_socket_t *)ev->ev_sender;
3056         INSIST(VALID_SOCKET(sock));
3057
3058         LOCK(&sock->lock);
3059         socket_log(sock, NULL, IOEVENT,
3060                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3061                    "internal_send: task %p got event %p", me, ev);
3062
3063         INSIST(sock->pending_send == 1);
3064         sock->pending_send = 0;
3065
3066         INSIST(sock->references > 0);
3067         sock->references--;  /* the internal event is done with this socket */
3068         if (sock->references == 0) {
3069                 UNLOCK(&sock->lock);
3070                 destroy(&sock);
3071                 return;
3072         }
3073
3074         /*
3075          * Try to do as much I/O as possible on this socket.  There are no
3076          * limits here, currently.
3077          */
3078         dev = ISC_LIST_HEAD(sock->send_list);
3079         while (dev != NULL) {
3080                 switch (doio_send(sock, dev)) {
3081                 case DOIO_SOFT:
3082                         goto poke;
3083
3084                 case DOIO_HARD:
3085                 case DOIO_SUCCESS:
3086                         send_senddone_event(sock, &dev);
3087                         break;
3088                 }
3089
3090                 dev = ISC_LIST_HEAD(sock->send_list);
3091         }
3092
3093  poke:
3094         if (!ISC_LIST_EMPTY(sock->send_list))
3095                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3096
3097         UNLOCK(&sock->lock);
3098 }
3099
3100 static void
3101 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3102         isc_socket_t *sock;
3103         int more_data;
3104
3105         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3106
3107         /*
3108          * Find out what socket this is and lock it.
3109          */
3110         sock = (isc_socket_t *)ev->ev_sender;
3111         INSIST(VALID_SOCKET(sock));
3112
3113         LOCK(&sock->lock);
3114         socket_log(sock, NULL, IOEVENT,
3115                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3116                    "internal_fdwatch_write: task %p got event %p", me, ev);
3117
3118         INSIST(sock->pending_send == 1);
3119
3120         UNLOCK(&sock->lock);
3121         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3122         LOCK(&sock->lock);
3123
3124         sock->pending_send = 0;
3125
3126         INSIST(sock->references > 0);
3127         sock->references--;  /* the internal event is done with this socket */
3128         if (sock->references == 0) {
3129                 UNLOCK(&sock->lock);
3130                 destroy(&sock);
3131                 return;
3132         }
3133
3134         if (more_data)
3135                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3136
3137         UNLOCK(&sock->lock);
3138 }
3139
3140 static void
3141 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3142         isc_socket_t *sock;
3143         int more_data;
3144
3145         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3146
3147         /*
3148          * Find out what socket this is and lock it.
3149          */
3150         sock = (isc_socket_t *)ev->ev_sender;
3151         INSIST(VALID_SOCKET(sock));
3152
3153         LOCK(&sock->lock);
3154         socket_log(sock, NULL, IOEVENT,
3155                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3156                    "internal_fdwatch_read: task %p got event %p", me, ev);
3157
3158         INSIST(sock->pending_recv == 1);
3159
3160         UNLOCK(&sock->lock);
3161         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3162         LOCK(&sock->lock);
3163
3164         sock->pending_recv = 0;
3165
3166         INSIST(sock->references > 0);
3167         sock->references--;  /* the internal event is done with this socket */
3168         if (sock->references == 0) {
3169                 UNLOCK(&sock->lock);
3170                 destroy(&sock);
3171                 return;
3172         }
3173
3174         if (more_data)
3175                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3176
3177         UNLOCK(&sock->lock);
3178 }
3179
3180 /*
3181  * Process read/writes on each fd here.  Avoid locking
3182  * and unlocking twice if both reads and writes are possible.
3183  */
3184 static void
3185 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
3186            isc_boolean_t writeable)
3187 {
3188         isc_socket_t *sock;
3189         isc_boolean_t unlock_sock;
3190         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3191         int lockid = FDLOCK_ID(fd);
3192
3193         /*
3194          * If the socket is going to be closed, don't do more I/O.
3195          */
3196         LOCK(&manager->fdlock[lockid]);
3197         if (manager->fdstate[fd] == CLOSE_PENDING) {
3198                 UNLOCK(&manager->fdlock[lockid]);
3199
3200                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3201                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3202                 return;
3203         }
3204
3205         sock = manager->fds[fd];
3206         unlock_sock = ISC_FALSE;
3207         if (readable) {
3208                 if (sock == NULL) {
3209                         unwatch_read = ISC_TRUE;
3210                         goto check_write;
3211                 }
3212                 unlock_sock = ISC_TRUE;
3213                 LOCK(&sock->lock);
3214                 if (!SOCK_DEAD(sock)) {
3215                         if (sock->listener)
3216                                 dispatch_accept(sock);
3217                         else
3218                                 dispatch_recv(sock);
3219                 }
3220                 unwatch_read = ISC_TRUE;
3221         }
3222 check_write:
3223         if (writeable) {
3224                 if (sock == NULL) {
3225                         unwatch_write = ISC_TRUE;
3226                         goto unlock_fd;
3227                 }
3228                 if (!unlock_sock) {
3229                         unlock_sock = ISC_TRUE;
3230                         LOCK(&sock->lock);
3231                 }
3232                 if (!SOCK_DEAD(sock)) {
3233                         if (sock->connecting)
3234                                 dispatch_connect(sock);
3235                         else
3236                                 dispatch_send(sock);
3237                 }
3238                 unwatch_write = ISC_TRUE;
3239         }
3240         if (unlock_sock)
3241                 UNLOCK(&sock->lock);
3242
3243  unlock_fd:
3244         UNLOCK(&manager->fdlock[lockid]);
3245         if (unwatch_read)
3246                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3247         if (unwatch_write)
3248                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3249
3250 }
3251
3252 #ifdef USE_KQUEUE
3253 static isc_boolean_t
3254 process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
3255         int i;
3256         isc_boolean_t readable, writable;
3257         isc_boolean_t done = ISC_FALSE;
3258 #ifdef ISC_PLATFORM_USETHREADS
3259         isc_boolean_t have_ctlevent = ISC_FALSE;
3260 #endif
3261
3262         if (nevents == manager->nevents) {
3263                 /*
3264                  * This is not an error, but something unexpected.  If this
3265                  * happens, it may indicate the need for increasing
3266                  * ISC_SOCKET_MAXEVENTS.
3267                  */
3268                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3269                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3270                             "maximum number of FD events (%d) received",
3271                             nevents);
3272         }
3273
3274         for (i = 0; i < nevents; i++) {
3275                 REQUIRE(events[i].ident < manager->maxsocks);
3276 #ifdef ISC_PLATFORM_USETHREADS
3277                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3278                         have_ctlevent = ISC_TRUE;
3279                         continue;
3280                 }
3281 #endif
3282                 readable = ISC_TF(events[i].filter == EVFILT_READ);
3283                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3284                 process_fd(manager, events[i].ident, readable, writable);
3285         }
3286
3287 #ifdef ISC_PLATFORM_USETHREADS
3288         if (have_ctlevent)
3289                 done = process_ctlfd(manager);
3290 #endif
3291
3292         return (done);
3293 }
3294 #elif defined(USE_EPOLL)
3295 static isc_boolean_t
3296 process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
3297         int i;
3298         isc_boolean_t done = ISC_FALSE;
3299 #ifdef ISC_PLATFORM_USETHREADS
3300         isc_boolean_t have_ctlevent = ISC_FALSE;
3301 #endif
3302
3303         if (nevents == manager->nevents) {
3304                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3305                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3306                             "maximum number of FD events (%d) received",
3307                             nevents);
3308         }
3309
3310         for (i = 0; i < nevents; i++) {
3311                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3312 #ifdef ISC_PLATFORM_USETHREADS
3313                 if (events[i].data.fd == manager->pipe_fds[0]) {
3314                         have_ctlevent = ISC_TRUE;
3315                         continue;
3316                 }
3317 #endif
3318                 if ((events[i].events & EPOLLERR) != 0 ||
3319                     (events[i].events & EPOLLHUP) != 0) {
3320                         /*
3321                          * epoll does not set IN/OUT bits on an erroneous
3322                          * condition, so we need to try both anyway.  This is a
3323                          * bit inefficient, but should be okay for such rare
3324                          * events.  Note also that the read or write attempt
3325                          * won't block because we use non-blocking sockets.
3326                          */
3327                         events[i].events |= (EPOLLIN | EPOLLOUT);
3328                 }
3329                 process_fd(manager, events[i].data.fd,
3330                            (events[i].events & EPOLLIN) != 0,
3331                            (events[i].events & EPOLLOUT) != 0);
3332         }
3333
3334 #ifdef ISC_PLATFORM_USETHREADS
3335         if (have_ctlevent)
3336                 done = process_ctlfd(manager);
3337 #endif
3338
3339         return (done);
3340 }
3341 #elif defined(USE_DEVPOLL)
3342 static isc_boolean_t
3343 process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
3344         int i;
3345         isc_boolean_t done = ISC_FALSE;
3346 #ifdef ISC_PLATFORM_USETHREADS
3347         isc_boolean_t have_ctlevent = ISC_FALSE;
3348 #endif
3349
3350         if (nevents == manager->nevents) {
3351                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3352                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3353                             "maximum number of FD events (%d) received",
3354                             nevents);
3355         }
3356
3357         for (i = 0; i < nevents; i++) {
3358                 REQUIRE(events[i].fd < (int)manager->maxsocks);
3359 #ifdef ISC_PLATFORM_USETHREADS
3360                 if (events[i].fd == manager->pipe_fds[0]) {
3361                         have_ctlevent = ISC_TRUE;
3362                         continue;
3363                 }
3364 #endif
3365                 process_fd(manager, events[i].fd,
3366                            (events[i].events & POLLIN) != 0,
3367                            (events[i].events & POLLOUT) != 0);
3368         }
3369
3370 #ifdef ISC_PLATFORM_USETHREADS
3371         if (have_ctlevent)
3372                 done = process_ctlfd(manager);
3373 #endif
3374
3375         return (done);
3376 }
3377 #elif defined(USE_SELECT)
3378 static void
3379 process_fds(isc_socketmgr_t *manager, int maxfd,
3380             fd_set *readfds, fd_set *writefds)
3381 {
3382         int i;
3383
3384         REQUIRE(maxfd <= (int)manager->maxsocks);
3385
3386         for (i = 0; i < maxfd; i++) {
3387 #ifdef ISC_PLATFORM_USETHREADS
3388                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3389                         continue;
3390 #endif /* ISC_PLATFORM_USETHREADS */
3391                 process_fd(manager, i, FD_ISSET(i, readfds),
3392                            FD_ISSET(i, writefds));
3393         }
3394 }
3395 #endif
3396
3397 #ifdef ISC_PLATFORM_USETHREADS
3398 static isc_boolean_t
3399 process_ctlfd(isc_socketmgr_t *manager) {
3400         int msg, fd;
3401
3402         for (;;) {
3403                 select_readmsg(manager, &fd, &msg);
3404
3405                 manager_log(manager, IOEVENT,
3406                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3407                                            ISC_MSG_WATCHERMSG,
3408                                            "watcher got message %d "
3409                                            "for socket %d"), msg, fd);
3410
3411                 /*
3412                  * Nothing to read?
3413                  */
3414                 if (msg == SELECT_POKE_NOTHING)
3415                         break;
3416
3417                 /*
3418                  * Handle shutdown message.  We really should
3419                  * jump out of this loop right away, but
3420                  * it doesn't matter if we have to do a little
3421                  * more work first.
3422                  */
3423                 if (msg == SELECT_POKE_SHUTDOWN)
3424                         return (ISC_TRUE);
3425
3426                 /*
3427                  * This is a wakeup on a socket.  Look
3428                  * at the event queue for both read and write,
3429                  * and decide if we need to watch on it now
3430                  * or not.
3431                  */
3432                 wakeup_socket(manager, fd, msg);
3433         }
3434
3435         return (ISC_FALSE);
3436 }
3437
3438 /*
3439  * This is the thread that will loop forever, always in a select or poll
3440  * call.
3441  *
3442  * When select returns something to do, track down what thread gets to do
3443  * this I/O and post the event to it.
3444  */
3445 static isc_threadresult_t
3446 watcher(void *uap) {
3447         isc_socketmgr_t *manager = uap;
3448         isc_boolean_t done;
3449         int ctlfd;
3450         int cc;
3451 #ifdef USE_KQUEUE
3452         const char *fnname = "kevent()";
3453 #elif defined (USE_EPOLL)
3454         const char *fnname = "epoll_wait()";
3455 #elif defined(USE_DEVPOLL)
3456         const char *fnname = "ioctl(DP_POLL)";
3457         struct dvpoll dvp;
3458 #elif defined (USE_SELECT)
3459         const char *fnname = "select()";
3460         int maxfd;
3461 #endif
3462         char strbuf[ISC_STRERRORSIZE];
3463 #ifdef ISC_SOCKET_USE_POLLWATCH
3464         pollstate_t pollstate = poll_idle;
3465 #endif
3466
3467         /*
3468          * Get the control fd here.  This will never change.
3469          */
3470         ctlfd = manager->pipe_fds[0];
3471         done = ISC_FALSE;
3472         while (!done) {
3473                 do {
3474 #ifdef USE_KQUEUE
3475                         cc = kevent(manager->kqueue_fd, NULL, 0,
3476                                     manager->events, manager->nevents, NULL);
3477 #elif defined(USE_EPOLL)
3478                         cc = epoll_wait(manager->epoll_fd, manager->events,
3479                                         manager->nevents, -1);
3480 #elif defined(USE_DEVPOLL)
3481                         dvp.dp_fds = manager->events;
3482                         dvp.dp_nfds = manager->nevents;
3483 #ifndef ISC_SOCKET_USE_POLLWATCH
3484                         dvp.dp_timeout = -1;
3485 #else
3486                         if (pollstate == poll_idle)
3487                                 dvp.dp_timeout = -1;
3488                         else
3489                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3490 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3491                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3492 #elif defined(USE_SELECT)
3493                         LOCK(&manager->lock);
3494                         memcpy(manager->read_fds_copy, manager->read_fds,
3495                                manager->fd_bufsize);
3496                         memcpy(manager->write_fds_copy, manager->write_fds,
3497                                manager->fd_bufsize);
3498                         maxfd = manager->maxfd + 1;
3499                         UNLOCK(&manager->lock);
3500
3501                         cc = select(maxfd, manager->read_fds_copy,
3502                                     manager->write_fds_copy, NULL, NULL);
3503 #endif  /* USE_KQUEUE */
3504
3505                         if (cc < 0 && !SOFT_ERROR(errno)) {
3506                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3507                                 FATAL_ERROR(__FILE__, __LINE__,
3508                                             "%s %s: %s", fnname,
3509                                             isc_msgcat_get(isc_msgcat,
3510                                                            ISC_MSGSET_GENERAL,
3511                                                            ISC_MSG_FAILED,
3512                                                            "failed"), strbuf);
3513                         }
3514
3515 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3516                         if (cc == 0) {
3517                                 if (pollstate == poll_active)
3518                                         pollstate = poll_checking;
3519                                 else if (pollstate == poll_checking)
3520                                         pollstate = poll_idle;
3521                         } else if (cc > 0) {
3522                                 if (pollstate == poll_checking) {
3523                                         /*
3524                                          * XXX: We'd like to use a more
3525                                          * verbose log level as it's actually an
3526                                          * unexpected event, but the kernel bug
3527                                          * reportedly happens pretty frequently
3528                                          * (and it can also be a false positive)
3529                                          * so it would be just too noisy.
3530                                          */
3531                                         manager_log(manager,
3532                                                     ISC_LOGCATEGORY_GENERAL,
3533                                                     ISC_LOGMODULE_SOCKET,
3534                                                     ISC_LOG_DEBUG(1),
3535                                                     "unexpected POLL timeout");
3536                                 }
3537                                 pollstate = poll_active;
3538                         }
3539 #endif
3540                 } while (cc < 0);
3541
3542 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3543                 done = process_fds(manager, manager->events, cc);
3544 #elif defined(USE_SELECT)
3545                 process_fds(manager, maxfd, manager->read_fds_copy,
3546                             manager->write_fds_copy);
3547
3548                 /*
3549                  * Process reads on internal, control fd.
3550                  */
3551                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3552                         done = process_ctlfd(manager);
3553 #endif
3554         }
3555
3556         manager_log(manager, TRACE, "%s",
3557                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3558                                    ISC_MSG_EXITING, "watcher exiting"));
3559
3560         return ((isc_threadresult_t)0);
3561 }
3562 #endif /* ISC_PLATFORM_USETHREADS */
3563
3564 void
3565 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3566
3567         REQUIRE(VALID_MANAGER(manager));
3568
3569         manager->reserved = reserved;
3570 }
3571
3572 /*
3573  * Create a new socket manager.
3574  */
3575
3576 static isc_result_t
3577 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3578         isc_result_t result;
3579 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3580         char strbuf[ISC_STRERRORSIZE];
3581 #endif
3582
3583 #ifdef USE_KQUEUE
3584         manager->nevents = ISC_SOCKET_MAXEVENTS;
3585         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3586                                       manager->nevents);
3587         if (manager->events == NULL)
3588                 return (ISC_R_NOMEMORY);
3589         manager->kqueue_fd = kqueue();
3590         if (manager->kqueue_fd == -1) {
3591                 result = isc__errno2result(errno);
3592                 isc__strerror(errno, strbuf, sizeof(strbuf));
3593                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3594                                  "kqueue %s: %s",
3595                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3596                                                 ISC_MSG_FAILED, "failed"),
3597                                  strbuf);
3598                 isc_mem_put(mctx, manager->events,
3599                             sizeof(struct kevent) * manager->nevents);
3600                 return (result);
3601         }
3602
3603 #ifdef ISC_PLATFORM_USETHREADS
3604         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3605         if (result != ISC_R_SUCCESS) {
3606                 close(manager->kqueue_fd);
3607                 isc_mem_put(mctx, manager->events,
3608                             sizeof(struct kevent) * manager->nevents);
3609                 return (result);
3610         }
3611 #endif  /* ISC_PLATFORM_USETHREADS */
3612 #elif defined(USE_EPOLL)
3613         manager->nevents = ISC_SOCKET_MAXEVENTS;
3614         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3615                                       manager->nevents);
3616         if (manager->events == NULL)
3617                 return (ISC_R_NOMEMORY);
3618         manager->epoll_fd = epoll_create(manager->nevents);
3619         if (manager->epoll_fd == -1) {
3620                 result = isc__errno2result(errno);
3621                 isc__strerror(errno, strbuf, sizeof(strbuf));
3622                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3623                                  "epoll_create %s: %s",
3624                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3625                                                 ISC_MSG_FAILED, "failed"),
3626                                  strbuf);
3627                 isc_mem_put(mctx, manager->events,
3628                             sizeof(struct epoll_event) * manager->nevents);
3629                 return (result);
3630         }
3631 #ifdef ISC_PLATFORM_USETHREADS
3632         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3633         if (result != ISC_R_SUCCESS) {
3634                 close(manager->epoll_fd);
3635                 isc_mem_put(mctx, manager->events,
3636                             sizeof(struct epoll_event) * manager->nevents);
3637                 return (result);
3638         }
3639 #endif  /* ISC_PLATFORM_USETHREADS */
3640 #elif defined(USE_DEVPOLL)
3641         /*
3642          * XXXJT: /dev/poll seems to reject large numbers of events,
3643          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3644          */
3645         manager->nevents = ISC_SOCKET_MAXEVENTS;
3646         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3647                                       manager->nevents);
3648         if (manager->events == NULL)
3649                 return (ISC_R_NOMEMORY);
3650         /*
3651          * Note: fdpollinfo should be able to support all possible FDs, so
3652          * it must have maxsocks entries (not nevents).
3653          */
3654         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3655                                           manager->maxsocks);
3656         if (manager->fdpollinfo == NULL) {
3657                 isc_mem_put(mctx, manager->events,
3658                             sizeof(struct pollfd) * manager->nevents);
3659                 return (ISC_R_NOMEMORY);
3660         }
3661         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3662         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3663         if (manager->devpoll_fd == -1) {
3664                 result = isc__errno2result(errno);
3665                 isc__strerror(errno, strbuf, sizeof(strbuf));
3666                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3667                                  "open(/dev/poll) %s: %s",
3668                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3669                                                 ISC_MSG_FAILED, "failed"),
3670                                  strbuf);
3671                 isc_mem_put(mctx, manager->events,
3672                             sizeof(struct pollfd) * manager->nevents);
3673                 isc_mem_put(mctx, manager->fdpollinfo,
3674                             sizeof(pollinfo_t) * manager->maxsocks);
3675                 return (result);
3676         }
3677 #ifdef ISC_PLATFORM_USETHREADS
3678         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3679         if (result != ISC_R_SUCCESS) {
3680                 close(manager->devpoll_fd);
3681                 isc_mem_put(mctx, manager->events,
3682                             sizeof(struct pollfd) * manager->nevents);
3683                 isc_mem_put(mctx, manager->fdpollinfo,
3684                             sizeof(pollinfo_t) * manager->maxsocks);
3685                 return (result);
3686         }
3687 #endif  /* ISC_PLATFORM_USETHREADS */
3688 #elif defined(USE_SELECT)
3689         UNUSED(result);
3690
3691 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3692         /*
3693          * Note: this code should also cover the case of MAXSOCKETS <=
3694          * FD_SETSIZE, but we separate the cases to avoid possible portability
3695          * issues regarding howmany() and the actual representation of fd_set.
3696          */
3697         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3698                 sizeof(fd_mask);
3699 #else
3700         manager->fd_bufsize = sizeof(fd_set);
3701 #endif
3702
3703         manager->read_fds = NULL;
3704         manager->read_fds_copy = NULL;
3705         manager->write_fds = NULL;
3706         manager->write_fds_copy = NULL;
3707
3708         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3709         if (manager->read_fds != NULL)
3710                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3711         if (manager->read_fds_copy != NULL)
3712                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3713         if (manager->write_fds != NULL) {
3714                 manager->write_fds_copy = isc_mem_get(mctx,
3715                                                       manager->fd_bufsize);
3716         }
3717         if (manager->write_fds_copy == NULL) {
3718                 if (manager->write_fds != NULL) {
3719                         isc_mem_put(mctx, manager->write_fds,
3720                                     manager->fd_bufsize);
3721                 }
3722                 if (manager->read_fds_copy != NULL) {
3723                         isc_mem_put(mctx, manager->read_fds_copy,
3724                                     manager->fd_bufsize);
3725                 }
3726                 if (manager->read_fds != NULL) {
3727                         isc_mem_put(mctx, manager->read_fds,
3728                                     manager->fd_bufsize);
3729                 }
3730                 return (ISC_R_NOMEMORY);
3731         }
3732         memset(manager->read_fds, 0, manager->fd_bufsize);
3733         memset(manager->write_fds, 0, manager->fd_bufsize);
3734
3735 #ifdef ISC_PLATFORM_USETHREADS
3736         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3737         manager->maxfd = manager->pipe_fds[0];
3738 #else /* ISC_PLATFORM_USETHREADS */
3739         manager->maxfd = 0;
3740 #endif /* ISC_PLATFORM_USETHREADS */
3741 #endif  /* USE_KQUEUE */
3742
3743         return (ISC_R_SUCCESS);
3744 }
3745
3746 static void
3747 cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3748 #ifdef ISC_PLATFORM_USETHREADS
3749         isc_result_t result;
3750
3751         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3752         if (result != ISC_R_SUCCESS) {
3753                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3754                                  "epoll_ctl(DEL) %s",
3755                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3756                                                 ISC_MSG_FAILED, "failed"));
3757         }
3758 #endif  /* ISC_PLATFORM_USETHREADS */
3759
3760 #ifdef USE_KQUEUE
3761         close(manager->kqueue_fd);
3762         isc_mem_put(mctx, manager->events,
3763                     sizeof(struct kevent) * manager->nevents);
3764 #elif defined(USE_EPOLL)
3765         close(manager->epoll_fd);
3766         isc_mem_put(mctx, manager->events,
3767                     sizeof(struct epoll_event) * manager->nevents);
3768 #elif defined(USE_DEVPOLL)
3769         close(manager->devpoll_fd);
3770         isc_mem_put(mctx, manager->events,
3771                     sizeof(struct pollfd) * manager->nevents);
3772         isc_mem_put(mctx, manager->fdpollinfo,
3773                     sizeof(pollinfo_t) * manager->maxsocks);
3774 #elif defined(USE_SELECT)
3775         if (manager->read_fds != NULL)
3776                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3777         if (manager->read_fds_copy != NULL)
3778                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3779         if (manager->write_fds != NULL)
3780                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3781         if (manager->write_fds_copy != NULL)
3782                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3783 #endif  /* USE_KQUEUE */
3784 }
3785
3786 isc_result_t
3787 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3788         return (isc_socketmgr_create2(mctx, managerp, 0));
3789 }
3790
3791 isc_result_t
3792 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3793                       unsigned int maxsocks)
3794 {
3795         int i;
3796         isc_socketmgr_t *manager;
3797 #ifdef ISC_PLATFORM_USETHREADS
3798         char strbuf[ISC_STRERRORSIZE];
3799 #endif
3800         isc_result_t result;
3801
3802         REQUIRE(managerp != NULL && *managerp == NULL);
3803
3804 #ifndef ISC_PLATFORM_USETHREADS
3805         if (socketmgr != NULL) {
3806                 /* Don't allow maxsocks to be updated */
3807                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3808                         return (ISC_R_EXISTS);
3809
3810                 socketmgr->refs++;
3811                 *managerp = socketmgr;
3812                 return (ISC_R_SUCCESS);
3813         }
3814 #endif /* ISC_PLATFORM_USETHREADS */
3815
3816         if (maxsocks == 0)
3817                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3818
3819         manager = isc_mem_get(mctx, sizeof(*manager));
3820         if (manager == NULL)
3821                 return (ISC_R_NOMEMORY);
3822
3823         /* zero-clear so that necessary cleanup on failure will be easy */
3824         memset(manager, 0, sizeof(*manager));
3825         manager->maxsocks = maxsocks;
3826         manager->reserved = 0;
3827         manager->fds = isc_mem_get(mctx,
3828                                    manager->maxsocks * sizeof(isc_socket_t *));
3829         if (manager->fds == NULL) {
3830                 result = ISC_R_NOMEMORY;
3831                 goto free_manager;
3832         }
3833         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3834         if (manager->fdstate == NULL) {
3835                 result = ISC_R_NOMEMORY;
3836                 goto free_manager;
3837         }
3838         manager->stats = NULL;
3839
3840         manager->magic = SOCKET_MANAGER_MAGIC;
3841         manager->mctx = NULL;
3842         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3843         ISC_LIST_INIT(manager->socklist);
3844         result = isc_mutex_init(&manager->lock);
3845         if (result != ISC_R_SUCCESS)
3846                 goto free_manager;
3847         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3848         if (manager->fdlock == NULL) {
3849                 result = ISC_R_NOMEMORY;
3850                 goto cleanup_lock;
3851         }
3852         for (i = 0; i < FDLOCK_COUNT; i++) {
3853                 result = isc_mutex_init(&manager->fdlock[i]);
3854                 if (result != ISC_R_SUCCESS) {
3855                         while (--i >= 0)
3856                                 DESTROYLOCK(&manager->fdlock[i]);
3857                         isc_mem_put(mctx, manager->fdlock,
3858                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3859                         manager->fdlock = NULL;
3860                         goto cleanup_lock;
3861                 }
3862         }
3863
3864 #ifdef ISC_PLATFORM_USETHREADS
3865         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3866                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3867                                  "isc_condition_init() %s",
3868                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3869                                                 ISC_MSG_FAILED, "failed"));
3870                 result = ISC_R_UNEXPECTED;
3871                 goto cleanup_lock;
3872         }
3873
3874         /*
3875          * Create the special fds that will be used to wake up the
3876          * select/poll loop when something internal needs to be done.
3877          */
3878         if (pipe(manager->pipe_fds) != 0) {
3879                 isc__strerror(errno, strbuf, sizeof(strbuf));
3880                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3881                                  "pipe() %s: %s",
3882                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3883                                                 ISC_MSG_FAILED, "failed"),
3884                                  strbuf);
3885                 result = ISC_R_UNEXPECTED;
3886                 goto cleanup_condition;
3887         }
3888
3889         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3890 #if 0
3891         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3892 #endif
3893 #else /* ISC_PLATFORM_USETHREADS */
3894         manager->refs = 1;
3895 #endif /* ISC_PLATFORM_USETHREADS */
3896
3897         /*
3898          * Set up initial state for the select loop
3899          */
3900         result = setup_watcher(mctx, manager);
3901         if (result != ISC_R_SUCCESS)
3902                 goto cleanup;
3903         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3904 #ifdef ISC_PLATFORM_USETHREADS
3905         /*
3906          * Start up the select/poll thread.
3907          */
3908         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3909             ISC_R_SUCCESS) {
3910                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3911                                  "isc_thread_create() %s",
3912                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3913                                                 ISC_MSG_FAILED, "failed"));
3914                 cleanup_watcher(mctx, manager);
3915                 result = ISC_R_UNEXPECTED;
3916                 goto cleanup;
3917         }
3918 #endif /* ISC_PLATFORM_USETHREADS */
3919         isc_mem_attach(mctx, &manager->mctx);
3920
3921 #ifndef ISC_PLATFORM_USETHREADS
3922         socketmgr = manager;
3923 #endif /* ISC_PLATFORM_USETHREADS */
3924         *managerp = manager;
3925
3926         return (ISC_R_SUCCESS);
3927
3928 cleanup:
3929 #ifdef ISC_PLATFORM_USETHREADS
3930         (void)close(manager->pipe_fds[0]);
3931         (void)close(manager->pipe_fds[1]);
3932 #endif  /* ISC_PLATFORM_USETHREADS */
3933
3934 #ifdef ISC_PLATFORM_USETHREADS
3935 cleanup_condition:
3936         (void)isc_condition_destroy(&manager->shutdown_ok);
3937 #endif  /* ISC_PLATFORM_USETHREADS */
3938
3939
3940 cleanup_lock:
3941         if (manager->fdlock != NULL) {
3942                 for (i = 0; i < FDLOCK_COUNT; i++)
3943                         DESTROYLOCK(&manager->fdlock[i]);
3944         }
3945         DESTROYLOCK(&manager->lock);
3946
3947 free_manager:
3948         if (manager->fdlock != NULL) {
3949                 isc_mem_put(mctx, manager->fdlock,
3950                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3951         }
3952         if (manager->fdstate != NULL) {
3953                 isc_mem_put(mctx, manager->fdstate,
3954                             manager->maxsocks * sizeof(int));
3955         }
3956         if (manager->fds != NULL) {
3957                 isc_mem_put(mctx, manager->fds,
3958                             manager->maxsocks * sizeof(isc_socket_t *));
3959         }
3960         isc_mem_put(mctx, manager, sizeof(*manager));
3961
3962         return (result);
3963 }
3964
3965 isc_result_t
3966 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3967         REQUIRE(VALID_MANAGER(manager));
3968         REQUIRE(nsockp != NULL);
3969
3970         *nsockp = manager->maxsocks;
3971
3972         return (ISC_R_SUCCESS);
3973 }
3974
3975 void
3976 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3977         REQUIRE(VALID_MANAGER(manager));
3978         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3979         REQUIRE(manager->stats == NULL);
3980         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3981
3982         isc_stats_attach(stats, &manager->stats);
3983 }
3984
3985 void
3986 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
3987         isc_socketmgr_t *manager;
3988         int i;
3989         isc_mem_t *mctx;
3990
3991         /*
3992          * Destroy a socket manager.
3993          */
3994
3995         REQUIRE(managerp != NULL);
3996         manager = *managerp;
3997         REQUIRE(VALID_MANAGER(manager));
3998
3999 #ifndef ISC_PLATFORM_USETHREADS
4000         if (manager->refs > 1) {
4001                 manager->refs--;
4002                 *managerp = NULL;
4003                 return;
4004         }
4005 #endif /* ISC_PLATFORM_USETHREADS */
4006
4007         LOCK(&manager->lock);
4008
4009 #ifdef ISC_PLATFORM_USETHREADS
4010         /*
4011          * Wait for all sockets to be destroyed.
4012          */
4013         while (!ISC_LIST_EMPTY(manager->socklist)) {
4014                 manager_log(manager, CREATION, "%s",
4015                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4016                                            ISC_MSG_SOCKETSREMAIN,
4017                                            "sockets exist"));
4018                 WAIT(&manager->shutdown_ok, &manager->lock);
4019         }
4020 #else /* ISC_PLATFORM_USETHREADS */
4021         /*
4022          * Hope all sockets have been destroyed.
4023          */
4024         if (!ISC_LIST_EMPTY(manager->socklist)) {
4025                 manager_log(manager, CREATION, "%s",
4026                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4027                                            ISC_MSG_SOCKETSREMAIN,
4028                                            "sockets exist"));
4029                 INSIST(0);
4030         }
4031 #endif /* ISC_PLATFORM_USETHREADS */
4032
4033         UNLOCK(&manager->lock);
4034
4035         /*
4036          * Here, poke our select/poll thread.  Do this by closing the write
4037          * half of the pipe, which will send EOF to the read half.
4038          * This is currently a no-op in the non-threaded case.
4039          */
4040         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4041
4042 #ifdef ISC_PLATFORM_USETHREADS
4043         /*
4044          * Wait for thread to exit.
4045          */
4046         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4047                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4048                                  "isc_thread_join() %s",
4049                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4050                                                 ISC_MSG_FAILED, "failed"));
4051 #endif /* ISC_PLATFORM_USETHREADS */
4052
4053         /*
4054          * Clean up.
4055          */
4056         cleanup_watcher(manager->mctx, manager);
4057
4058 #ifdef ISC_PLATFORM_USETHREADS
4059         (void)close(manager->pipe_fds[0]);
4060         (void)close(manager->pipe_fds[1]);
4061         (void)isc_condition_destroy(&manager->shutdown_ok);
4062 #endif /* ISC_PLATFORM_USETHREADS */
4063
4064         for (i = 0; i < (int)manager->maxsocks; i++)
4065                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4066                         (void)close(i);
4067
4068         isc_mem_put(manager->mctx, manager->fds,
4069                     manager->maxsocks * sizeof(isc_socket_t *));
4070         isc_mem_put(manager->mctx, manager->fdstate,
4071                     manager->maxsocks * sizeof(int));
4072
4073         if (manager->stats != NULL)
4074                 isc_stats_detach(&manager->stats);
4075
4076         if (manager->fdlock != NULL) {
4077                 for (i = 0; i < FDLOCK_COUNT; i++)
4078                         DESTROYLOCK(&manager->fdlock[i]);
4079                 isc_mem_put(manager->mctx, manager->fdlock,
4080                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4081         }
4082         DESTROYLOCK(&manager->lock);
4083         manager->magic = 0;
4084         mctx= manager->mctx;
4085         isc_mem_put(mctx, manager, sizeof(*manager));
4086
4087         isc_mem_detach(&mctx);
4088
4089         *managerp = NULL;
4090 }
4091
4092 static isc_result_t
4093 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4094             unsigned int flags)
4095 {
4096         int io_state;
4097         isc_boolean_t have_lock = ISC_FALSE;
4098         isc_task_t *ntask = NULL;
4099         isc_result_t result = ISC_R_SUCCESS;
4100
4101         dev->ev_sender = task;
4102
4103         if (sock->type == isc_sockettype_udp) {
4104                 io_state = doio_recv(sock, dev);
4105         } else {
4106                 LOCK(&sock->lock);
4107                 have_lock = ISC_TRUE;
4108
4109                 if (ISC_LIST_EMPTY(sock->recv_list))
4110                         io_state = doio_recv(sock, dev);
4111                 else
4112                         io_state = DOIO_SOFT;
4113         }
4114
4115         switch (io_state) {
4116         case DOIO_SOFT:
4117                 /*
4118                  * We couldn't read all or part of the request right now, so
4119                  * queue it.
4120                  *
4121                  * Attach to socket and to task
4122                  */
4123                 isc_task_attach(task, &ntask);
4124                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4125
4126                 if (!have_lock) {
4127                         LOCK(&sock->lock);
4128                         have_lock = ISC_TRUE;
4129                 }
4130
4131                 /*
4132                  * Enqueue the request.  If the socket was previously not being
4133                  * watched, poke the watcher to start paying attention to it.
4134                  */
4135                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4136                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4137                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4138
4139                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4140                            "socket_recv: event %p -> task %p",
4141                            dev, ntask);
4142
4143                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4144                         result = ISC_R_INPROGRESS;
4145                 break;
4146
4147         case DOIO_EOF:
4148                 dev->result = ISC_R_EOF;
4149                 /* fallthrough */
4150
4151         case DOIO_HARD:
4152         case DOIO_SUCCESS:
4153                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4154                         send_recvdone_event(sock, &dev);
4155                 break;
4156         }
4157
4158         if (have_lock)
4159                 UNLOCK(&sock->lock);
4160
4161         return (result);
4162 }
4163
4164 isc_result_t
4165 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4166                  unsigned int minimum, isc_task_t *task,
4167                  isc_taskaction_t action, const void *arg)
4168 {
4169         isc_socketevent_t *dev;
4170         isc_socketmgr_t *manager;
4171         unsigned int iocount;
4172         isc_buffer_t *buffer;
4173
4174         REQUIRE(VALID_SOCKET(sock));
4175         REQUIRE(buflist != NULL);
4176         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4177         REQUIRE(task != NULL);
4178         REQUIRE(action != NULL);
4179
4180         manager = sock->manager;
4181         REQUIRE(VALID_MANAGER(manager));
4182
4183         iocount = isc_bufferlist_availablecount(buflist);
4184         REQUIRE(iocount > 0);
4185
4186         INSIST(sock->bound);
4187
4188         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4189         if (dev == NULL) {
4190                 return (ISC_R_NOMEMORY);
4191         }
4192
4193         /*
4194          * UDP sockets are always partial read
4195          */
4196         if (sock->type == isc_sockettype_udp)
4197                 dev->minimum = 1;
4198         else {
4199                 if (minimum == 0)
4200                         dev->minimum = iocount;
4201                 else
4202                         dev->minimum = minimum;
4203         }
4204
4205         /*
4206          * Move each buffer from the passed in list to our internal one.
4207          */
4208         buffer = ISC_LIST_HEAD(*buflist);
4209         while (buffer != NULL) {
4210                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4211                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4212                 buffer = ISC_LIST_HEAD(*buflist);
4213         }
4214
4215         return (socket_recv(sock, dev, task, 0));
4216 }
4217
4218 isc_result_t
4219 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4220                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4221 {
4222         isc_socketevent_t *dev;
4223         isc_socketmgr_t *manager;
4224
4225         REQUIRE(VALID_SOCKET(sock));
4226         REQUIRE(action != NULL);
4227
4228         manager = sock->manager;
4229         REQUIRE(VALID_MANAGER(manager));
4230
4231         INSIST(sock->bound);
4232
4233         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4234         if (dev == NULL)
4235                 return (ISC_R_NOMEMORY);
4236
4237         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4238 }
4239
4240 isc_result_t
4241 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
4242                  unsigned int minimum, isc_task_t *task,
4243                  isc_socketevent_t *event, unsigned int flags)
4244 {
4245         event->ev_sender = sock;
4246         event->result = ISC_R_UNEXPECTED;
4247         ISC_LIST_INIT(event->bufferlist);
4248         event->region = *region;
4249         event->n = 0;
4250         event->offset = 0;
4251         event->attributes = 0;
4252
4253         /*
4254          * UDP sockets are always partial read.
4255          */
4256         if (sock->type == isc_sockettype_udp)
4257                 event->minimum = 1;
4258         else {
4259                 if (minimum == 0)
4260                         event->minimum = region->length;
4261                 else
4262                         event->minimum = minimum;
4263         }
4264
4265         return (socket_recv(sock, event, task, flags));
4266 }
4267
4268 static isc_result_t
4269 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4270             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4271             unsigned int flags)
4272 {
4273         int io_state;
4274         isc_boolean_t have_lock = ISC_FALSE;
4275         isc_task_t *ntask = NULL;
4276         isc_result_t result = ISC_R_SUCCESS;
4277
4278         dev->ev_sender = task;
4279
4280         set_dev_address(address, sock, dev);
4281         if (pktinfo != NULL) {
4282                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4283                 dev->pktinfo = *pktinfo;
4284
4285                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4286                     !isc_sockaddr_islinklocal(&dev->address)) {
4287                         socket_log(sock, NULL, TRACE, isc_msgcat,
4288                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4289                                    "pktinfo structure provided, ifindex %u "
4290                                    "(set to 0)", pktinfo->ipi6_ifindex);
4291
4292                         /*
4293                          * Set the pktinfo index to 0 here, to let the
4294                          * kernel decide what interface it should send on.
4295                          */
4296                         dev->pktinfo.ipi6_ifindex = 0;
4297                 }
4298         }
4299
4300         if (sock->type == isc_sockettype_udp)
4301                 io_state = doio_send(sock, dev);
4302         else {
4303                 LOCK(&sock->lock);
4304                 have_lock = ISC_TRUE;
4305
4306                 if (ISC_LIST_EMPTY(sock->send_list))
4307                         io_state = doio_send(sock, dev);
4308                 else
4309                         io_state = DOIO_SOFT;
4310         }
4311
4312         switch (io_state) {
4313         case DOIO_SOFT:
4314                 /*
4315                  * We couldn't send all or part of the request right now, so
4316                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4317                  */
4318                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4319                         isc_task_attach(task, &ntask);
4320                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4321
4322                         if (!have_lock) {
4323                                 LOCK(&sock->lock);
4324                                 have_lock = ISC_TRUE;
4325                         }
4326
4327                         /*
4328                          * Enqueue the request.  If the socket was previously
4329                          * not being watched, poke the watcher to start
4330                          * paying attention to it.
4331                          */
4332                         if (ISC_LIST_EMPTY(sock->send_list) &&
4333                             !sock->pending_send)
4334                                 select_poke(sock->manager, sock->fd,
4335                                             SELECT_POKE_WRITE);
4336                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4337
4338                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4339                                    "socket_send: event %p -> task %p",
4340                                    dev, ntask);
4341
4342                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4343                                 result = ISC_R_INPROGRESS;
4344                         break;
4345                 }
4346
4347         case DOIO_HARD:
4348         case DOIO_SUCCESS:
4349                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4350                         send_senddone_event(sock, &dev);
4351                 break;
4352         }
4353
4354         if (have_lock)
4355                 UNLOCK(&sock->lock);
4356
4357         return (result);
4358 }
4359
4360 isc_result_t
4361 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
4362                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4363 {
4364         /*
4365          * REQUIRE() checking is performed in isc_socket_sendto().
4366          */
4367         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
4368                                   NULL));
4369 }
4370
4371 isc_result_t
4372 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
4373                   isc_task_t *task, isc_taskaction_t action, const void *arg,
4374                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4375 {
4376         isc_socketevent_t *dev;
4377         isc_socketmgr_t *manager;
4378
4379         REQUIRE(VALID_SOCKET(sock));
4380         REQUIRE(region != NULL);
4381         REQUIRE(task != NULL);
4382         REQUIRE(action != NULL);
4383
4384         manager = sock->manager;
4385         REQUIRE(VALID_MANAGER(manager));
4386
4387         INSIST(sock->bound);
4388
4389         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4390         if (dev == NULL) {
4391                 return (ISC_R_NOMEMORY);
4392         }
4393
4394         dev->region = *region;
4395
4396         return (socket_send(sock, dev, task, address, pktinfo, 0));
4397 }
4398
4399 isc_result_t
4400 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4401                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4402 {
4403         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4404                                    NULL));
4405 }
4406
4407 isc_result_t
4408 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4409                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4410                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4411 {
4412         isc_socketevent_t *dev;
4413         isc_socketmgr_t *manager;
4414         unsigned int iocount;
4415         isc_buffer_t *buffer;
4416
4417         REQUIRE(VALID_SOCKET(sock));
4418         REQUIRE(buflist != NULL);
4419         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4420         REQUIRE(task != NULL);
4421         REQUIRE(action != NULL);
4422
4423         manager = sock->manager;
4424         REQUIRE(VALID_MANAGER(manager));
4425
4426         iocount = isc_bufferlist_usedcount(buflist);
4427         REQUIRE(iocount > 0);
4428
4429         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4430         if (dev == NULL) {
4431                 return (ISC_R_NOMEMORY);
4432         }
4433
4434         /*
4435          * Move each buffer from the passed in list to our internal one.
4436          */
4437         buffer = ISC_LIST_HEAD(*buflist);
4438         while (buffer != NULL) {
4439                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4440                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4441                 buffer = ISC_LIST_HEAD(*buflist);
4442         }
4443
4444         return (socket_send(sock, dev, task, address, pktinfo, 0));
4445 }
4446
4447 isc_result_t
4448 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4449                    isc_task_t *task,
4450                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4451                    isc_socketevent_t *event, unsigned int flags)
4452 {
4453         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4454         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4455                 REQUIRE(sock->type == isc_sockettype_udp);
4456         event->ev_sender = sock;
4457         event->result = ISC_R_UNEXPECTED;
4458         ISC_LIST_INIT(event->bufferlist);
4459         event->region = *region;
4460         event->n = 0;
4461         event->offset = 0;
4462         event->attributes = 0;
4463
4464         return (socket_send(sock, event, task, address, pktinfo, flags));
4465 }
4466
4467 void
4468 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4469 #ifdef ISC_PLATFORM_HAVESYSUNH
4470         int s;
4471         struct stat sb;
4472         char strbuf[ISC_STRERRORSIZE];
4473
4474         if (sockaddr->type.sa.sa_family != AF_UNIX)
4475                 return;
4476
4477 #ifndef S_ISSOCK
4478 #if defined(S_IFMT) && defined(S_IFSOCK)
4479 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4480 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4481 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4482 #endif
4483 #endif
4484
4485 #ifndef S_ISFIFO
4486 #if defined(S_IFMT) && defined(S_IFIFO)
4487 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4488 #elif defined(_S_IFMT) && defined(S_IFIFO)
4489 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4490 #endif
4491 #endif
4492
4493 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4494 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4495 #endif
4496
4497 #ifndef S_ISFIFO
4498 #define S_ISFIFO(mode) 0
4499 #endif
4500
4501 #ifndef S_ISSOCK
4502 #define S_ISSOCK(mode) 0
4503 #endif
4504
4505         if (active) {
4506                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4507                         isc__strerror(errno, strbuf, sizeof(strbuf));
4508                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4509                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4510                                       "isc_socket_cleanunix: stat(%s): %s",
4511                                       sockaddr->type.sunix.sun_path, strbuf);
4512                         return;
4513                 }
4514                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4515                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4516                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4517                                       "isc_socket_cleanunix: %s: not a socket",
4518                                       sockaddr->type.sunix.sun_path);
4519                         return;
4520                 }
4521                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4522                         isc__strerror(errno, strbuf, sizeof(strbuf));
4523                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4524                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4525                                       "isc_socket_cleanunix: unlink(%s): %s",
4526                                       sockaddr->type.sunix.sun_path, strbuf);
4527                 }
4528                 return;
4529         }
4530
4531         s = socket(AF_UNIX, SOCK_STREAM, 0);
4532         if (s < 0) {
4533                 isc__strerror(errno, strbuf, sizeof(strbuf));
4534                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4535                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4536                               "isc_socket_cleanunix: socket(%s): %s",
4537                               sockaddr->type.sunix.sun_path, strbuf);
4538                 return;
4539         }
4540
4541         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4542                 switch (errno) {
4543                 case ENOENT:    /* We exited cleanly last time */
4544                         break;
4545                 default:
4546                         isc__strerror(errno, strbuf, sizeof(strbuf));
4547                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4548                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4549                                       "isc_socket_cleanunix: stat(%s): %s",
4550                                       sockaddr->type.sunix.sun_path, strbuf);
4551                         break;
4552                 }
4553                 goto cleanup;
4554         }
4555
4556         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4557                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4558                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4559                               "isc_socket_cleanunix: %s: not a socket",
4560                               sockaddr->type.sunix.sun_path);
4561                 goto cleanup;
4562         }
4563
4564         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4565                     sizeof(sockaddr->type.sunix)) < 0) {
4566                 switch (errno) {
4567                 case ECONNREFUSED:
4568                 case ECONNRESET:
4569                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4570                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4571                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4572                                               ISC_LOGMODULE_SOCKET,
4573                                               ISC_LOG_WARNING,
4574                                               "isc_socket_cleanunix: "
4575                                               "unlink(%s): %s",
4576                                               sockaddr->type.sunix.sun_path,
4577                                               strbuf);
4578                         }
4579                         break;
4580                 default:
4581                         isc__strerror(errno, strbuf, sizeof(strbuf));
4582                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4583                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4584                                       "isc_socket_cleanunix: connect(%s): %s",
4585                                       sockaddr->type.sunix.sun_path, strbuf);
4586                         break;
4587                 }
4588         }
4589  cleanup:
4590         close(s);
4591 #else
4592         UNUSED(sockaddr);
4593         UNUSED(active);
4594 #endif
4595 }
4596
4597 isc_result_t
4598 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4599                     isc_uint32_t owner, isc_uint32_t group)
4600 {
4601 #ifdef ISC_PLATFORM_HAVESYSUNH
4602         isc_result_t result = ISC_R_SUCCESS;
4603         char strbuf[ISC_STRERRORSIZE];
4604         char path[sizeof(sockaddr->type.sunix.sun_path)];
4605 #ifdef NEED_SECURE_DIRECTORY
4606         char *slash;
4607 #endif
4608
4609         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4610         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4611         strcpy(path, sockaddr->type.sunix.sun_path);
4612
4613 #ifdef NEED_SECURE_DIRECTORY
4614         slash = strrchr(path, '/');
4615         if (slash != NULL) {
4616                 if (slash != path)
4617                         *slash = '\0';
4618                 else
4619                         strcpy(path, "/");
4620         } else
4621                 strcpy(path, ".");
4622 #endif
4623
4624         if (chmod(path, perm) < 0) {
4625                 isc__strerror(errno, strbuf, sizeof(strbuf));
4626                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4627                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4628                               "isc_socket_permunix: chmod(%s, %d): %s",
4629                               path, perm, strbuf);
4630                 result = ISC_R_FAILURE;
4631         }
4632         if (chown(path, owner, group) < 0) {
4633                 isc__strerror(errno, strbuf, sizeof(strbuf));
4634                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4635                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4636                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4637                               path, owner, group,
4638                               strbuf);
4639                 result = ISC_R_FAILURE;
4640         }
4641         return (result);
4642 #else
4643         UNUSED(sockaddr);
4644         UNUSED(perm);
4645         UNUSED(owner);
4646         UNUSED(group);
4647         return (ISC_R_NOTIMPLEMENTED);
4648 #endif
4649 }
4650
4651 isc_result_t
4652 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4653                 unsigned int options) {
4654         char strbuf[ISC_STRERRORSIZE];
4655         int on = 1;
4656
4657         LOCK(&sock->lock);
4658
4659         INSIST(!sock->bound);
4660
4661         if (sock->pf != sockaddr->type.sa.sa_family) {
4662                 UNLOCK(&sock->lock);
4663                 return (ISC_R_FAMILYMISMATCH);
4664         }
4665         /*
4666          * Only set SO_REUSEADDR when we want a specific port.
4667          */
4668 #ifdef AF_UNIX
4669         if (sock->pf == AF_UNIX)
4670                 goto bind_socket;
4671 #endif
4672         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4673             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4674             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4675                        sizeof(on)) < 0) {
4676                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4677                                  "setsockopt(%d) %s", sock->fd,
4678                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4679                                                 ISC_MSG_FAILED, "failed"));
4680                 /* Press on... */
4681         }
4682 #ifdef AF_UNIX
4683  bind_socket:
4684 #endif
4685         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4686                 inc_stats(sock->manager->stats,
4687                           sock->statsindex[STATID_BINDFAIL]);
4688
4689                 UNLOCK(&sock->lock);
4690                 switch (errno) {
4691                 case EACCES:
4692                         return (ISC_R_NOPERM);
4693                 case EADDRNOTAVAIL:
4694                         return (ISC_R_ADDRNOTAVAIL);
4695                 case EADDRINUSE:
4696                         return (ISC_R_ADDRINUSE);
4697                 case EINVAL:
4698                         return (ISC_R_BOUND);
4699                 default:
4700                         isc__strerror(errno, strbuf, sizeof(strbuf));
4701                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4702                                          strbuf);
4703                         return (ISC_R_UNEXPECTED);
4704                 }
4705         }
4706
4707         socket_log(sock, sockaddr, TRACE,
4708                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4709         sock->bound = 1;
4710
4711         UNLOCK(&sock->lock);
4712         return (ISC_R_SUCCESS);
4713 }
4714
4715 isc_result_t
4716 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4717 #ifdef SO_ACCEPTFILTER
4718         char strbuf[ISC_STRERRORSIZE];
4719         struct accept_filter_arg afa;
4720 #else
4721         UNUSED(sock);
4722         UNUSED(filter);
4723 #endif
4724
4725         REQUIRE(VALID_SOCKET(sock));
4726
4727 #ifdef SO_ACCEPTFILTER
4728         bzero(&afa, sizeof(afa));
4729         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4730         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4731                          &afa, sizeof(afa)) == -1) {
4732                 isc__strerror(errno, strbuf, sizeof(strbuf));
4733                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4734                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4735                            strbuf);
4736                 return (ISC_R_FAILURE);
4737         }
4738         return (ISC_R_SUCCESS);
4739 #else
4740         return (ISC_R_NOTIMPLEMENTED);
4741 #endif
4742 }
4743
4744 /*
4745  * Set up to listen on a given socket.  We do this by creating an internal
4746  * event that will be dispatched when the socket has read activity.  The
4747  * watcher will send the internal event to the task when there is a new
4748  * connection.
4749  *
4750  * Unlike in read, we don't preallocate a done event here.  Every time there
4751  * is a new connection we'll have to allocate a new one anyway, so we might
4752  * as well keep things simple rather than having to track them.
4753  */
4754 isc_result_t
4755 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4756         char strbuf[ISC_STRERRORSIZE];
4757
4758         REQUIRE(VALID_SOCKET(sock));
4759
4760         LOCK(&sock->lock);
4761
4762         REQUIRE(!sock->listener);
4763         REQUIRE(sock->bound);
4764         REQUIRE(sock->type == isc_sockettype_tcp ||
4765                 sock->type == isc_sockettype_unix);
4766
4767         if (backlog == 0)
4768                 backlog = SOMAXCONN;
4769
4770         if (listen(sock->fd, (int)backlog) < 0) {
4771                 UNLOCK(&sock->lock);
4772                 isc__strerror(errno, strbuf, sizeof(strbuf));
4773
4774                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4775
4776                 return (ISC_R_UNEXPECTED);
4777         }
4778
4779         sock->listener = 1;
4780
4781         UNLOCK(&sock->lock);
4782         return (ISC_R_SUCCESS);
4783 }
4784
4785 /*
4786  * This should try to do aggressive accept() XXXMLG
4787  */
4788 isc_result_t
4789 isc_socket_accept(isc_socket_t *sock,
4790                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4791 {
4792         isc_socket_newconnev_t *dev;
4793         isc_socketmgr_t *manager;
4794         isc_task_t *ntask = NULL;
4795         isc_socket_t *nsock;
4796         isc_result_t result;
4797         isc_boolean_t do_poke = ISC_FALSE;
4798
4799         REQUIRE(VALID_SOCKET(sock));
4800         manager = sock->manager;
4801         REQUIRE(VALID_MANAGER(manager));
4802
4803         LOCK(&sock->lock);
4804
4805         REQUIRE(sock->listener);
4806
4807         /*
4808          * Sender field is overloaded here with the task we will be sending
4809          * this event to.  Just before the actual event is delivered the
4810          * actual ev_sender will be touched up to be the socket.
4811          */
4812         dev = (isc_socket_newconnev_t *)
4813                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4814                                    action, arg, sizeof(*dev));
4815         if (dev == NULL) {
4816                 UNLOCK(&sock->lock);
4817                 return (ISC_R_NOMEMORY);
4818         }
4819         ISC_LINK_INIT(dev, ev_link);
4820
4821         result = allocate_socket(manager, sock->type, &nsock);
4822         if (result != ISC_R_SUCCESS) {
4823                 isc_event_free(ISC_EVENT_PTR(&dev));
4824                 UNLOCK(&sock->lock);
4825                 return (result);
4826         }
4827
4828         /*
4829          * Attach to socket and to task.
4830          */
4831         isc_task_attach(task, &ntask);
4832         nsock->references++;
4833         nsock->statsindex = sock->statsindex;
4834
4835         dev->ev_sender = ntask;
4836         dev->newsocket = nsock;
4837
4838         /*
4839          * Poke watcher here.  We still have the socket locked, so there
4840          * is no race condition.  We will keep the lock for such a short
4841          * bit of time waking it up now or later won't matter all that much.
4842          */
4843         if (ISC_LIST_EMPTY(sock->accept_list))
4844                 do_poke = ISC_TRUE;
4845
4846         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4847
4848         if (do_poke)
4849                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4850
4851         UNLOCK(&sock->lock);
4852         return (ISC_R_SUCCESS);
4853 }
4854
4855 isc_result_t
4856 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4857                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4858 {
4859         isc_socket_connev_t *dev;
4860         isc_task_t *ntask = NULL;
4861         isc_socketmgr_t *manager;
4862         int cc;
4863         char strbuf[ISC_STRERRORSIZE];
4864         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4865
4866         REQUIRE(VALID_SOCKET(sock));
4867         REQUIRE(addr != NULL);
4868         REQUIRE(task != NULL);
4869         REQUIRE(action != NULL);
4870
4871         manager = sock->manager;
4872         REQUIRE(VALID_MANAGER(manager));
4873         REQUIRE(addr != NULL);
4874
4875         if (isc_sockaddr_ismulticast(addr))
4876                 return (ISC_R_MULTICAST);
4877
4878         LOCK(&sock->lock);
4879
4880         REQUIRE(!sock->connecting);
4881
4882         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4883                                                         ISC_SOCKEVENT_CONNECT,
4884                                                         action, arg,
4885                                                         sizeof(*dev));
4886         if (dev == NULL) {
4887                 UNLOCK(&sock->lock);
4888                 return (ISC_R_NOMEMORY);
4889         }
4890         ISC_LINK_INIT(dev, ev_link);
4891
4892         /*
4893          * Try to do the connect right away, as there can be only one
4894          * outstanding, and it might happen to complete.
4895          */
4896         sock->peer_address = *addr;
4897         cc = connect(sock->fd, &addr->type.sa, addr->length);
4898         if (cc < 0) {
4899                 /*
4900                  * HP-UX "fails" to connect a UDP socket and sets errno to
4901                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4902                  * a success and let the user detect it if it's really an error
4903                  * at the time of sending a packet on the socket.
4904                  */
4905                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4906                         cc = 0;
4907                         goto success;
4908                 }
4909                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4910                         goto queue;
4911
4912                 switch (errno) {
4913 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4914                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4915                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4916                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4917                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4918                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4919 #ifdef EHOSTDOWN
4920                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4921 #endif
4922                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4923                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4924                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4925                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4926                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4927 #undef ERROR_MATCH
4928                 }
4929
4930                 sock->connected = 0;
4931
4932                 isc__strerror(errno, strbuf, sizeof(strbuf));
4933                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4934                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4935                                  addrbuf, errno, strbuf);
4936
4937                 UNLOCK(&sock->lock);
4938                 inc_stats(sock->manager->stats,
4939                           sock->statsindex[STATID_CONNECTFAIL]);
4940                 isc_event_free(ISC_EVENT_PTR(&dev));
4941                 return (ISC_R_UNEXPECTED);
4942
4943         err_exit:
4944                 sock->connected = 0;
4945                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4946
4947                 UNLOCK(&sock->lock);
4948                 inc_stats(sock->manager->stats,
4949                           sock->statsindex[STATID_CONNECTFAIL]);
4950                 return (ISC_R_SUCCESS);
4951         }
4952
4953         /*
4954          * If connect completed, fire off the done event.
4955          */
4956  success:
4957         if (cc == 0) {
4958                 sock->connected = 1;
4959                 sock->bound = 1;
4960                 dev->result = ISC_R_SUCCESS;
4961                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4962
4963                 UNLOCK(&sock->lock);
4964
4965                 inc_stats(sock->manager->stats,
4966                           sock->statsindex[STATID_CONNECT]);
4967
4968                 return (ISC_R_SUCCESS);
4969         }
4970
4971  queue:
4972
4973         /*
4974          * Attach to task.
4975          */
4976         isc_task_attach(task, &ntask);
4977
4978         sock->connecting = 1;
4979
4980         dev->ev_sender = ntask;
4981
4982         /*
4983          * Poke watcher here.  We still have the socket locked, so there
4984          * is no race condition.  We will keep the lock for such a short
4985          * bit of time waking it up now or later won't matter all that much.
4986          */
4987         if (sock->connect_ev == NULL)
4988                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
4989
4990         sock->connect_ev = dev;
4991
4992         UNLOCK(&sock->lock);
4993         return (ISC_R_SUCCESS);
4994 }
4995
4996 /*
4997  * Called when a socket with a pending connect() finishes.
4998  */
4999 static void
5000 internal_connect(isc_task_t *me, isc_event_t *ev) {
5001         isc_socket_t *sock;
5002         isc_socket_connev_t *dev;
5003         isc_task_t *task;
5004         int cc;
5005         ISC_SOCKADDR_LEN_T optlen;
5006         char strbuf[ISC_STRERRORSIZE];
5007         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5008
5009         UNUSED(me);
5010         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5011
5012         sock = ev->ev_sender;
5013         INSIST(VALID_SOCKET(sock));
5014
5015         LOCK(&sock->lock);
5016
5017         /*
5018          * When the internal event was sent the reference count was bumped
5019          * to keep the socket around for us.  Decrement the count here.
5020          */
5021         INSIST(sock->references > 0);
5022         sock->references--;
5023         if (sock->references == 0) {
5024                 UNLOCK(&sock->lock);
5025                 destroy(&sock);
5026                 return;
5027         }
5028
5029         /*
5030          * Has this event been canceled?
5031          */
5032         dev = sock->connect_ev;
5033         if (dev == NULL) {
5034                 INSIST(!sock->connecting);
5035                 UNLOCK(&sock->lock);
5036                 return;
5037         }
5038
5039         INSIST(sock->connecting);
5040         sock->connecting = 0;
5041
5042         /*
5043          * Get any possible error status here.
5044          */
5045         optlen = sizeof(cc);
5046         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5047                        (void *)&cc, (void *)&optlen) < 0)
5048                 cc = errno;
5049         else
5050                 errno = cc;
5051
5052         if (errno != 0) {
5053                 /*
5054                  * If the error is EAGAIN, just re-select on this
5055                  * fd and pretend nothing strange happened.
5056                  */
5057                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5058                         sock->connecting = 1;
5059                         select_poke(sock->manager, sock->fd,
5060                                     SELECT_POKE_CONNECT);
5061                         UNLOCK(&sock->lock);
5062
5063                         return;
5064                 }
5065
5066                 inc_stats(sock->manager->stats,
5067                           sock->statsindex[STATID_CONNECTFAIL]);
5068
5069                 /*
5070                  * Translate other errors into ISC_R_* flavors.
5071                  */
5072                 switch (errno) {
5073 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5074                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5075                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5076                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5077                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5078                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5079 #ifdef EHOSTDOWN
5080                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5081 #endif
5082                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5083                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5084                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5085                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5086                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5087                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5088 #undef ERROR_MATCH
5089                 default:
5090                         dev->result = ISC_R_UNEXPECTED;
5091                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5092                                             sizeof(peerbuf));
5093                         isc__strerror(errno, strbuf, sizeof(strbuf));
5094                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5095                                          "internal_connect: connect(%s) %s",
5096                                          peerbuf, strbuf);
5097                 }
5098         } else {
5099                 inc_stats(sock->manager->stats,
5100                           sock->statsindex[STATID_CONNECT]);
5101                 dev->result = ISC_R_SUCCESS;
5102                 sock->connected = 1;
5103                 sock->bound = 1;
5104         }
5105
5106         sock->connect_ev = NULL;
5107
5108         UNLOCK(&sock->lock);
5109
5110         task = dev->ev_sender;
5111         dev->ev_sender = sock;
5112         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5113 }
5114
5115 isc_result_t
5116 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5117         isc_result_t result;
5118
5119         REQUIRE(VALID_SOCKET(sock));
5120         REQUIRE(addressp != NULL);
5121
5122         LOCK(&sock->lock);
5123
5124         if (sock->connected) {
5125                 *addressp = sock->peer_address;
5126                 result = ISC_R_SUCCESS;
5127         } else {
5128                 result = ISC_R_NOTCONNECTED;
5129         }
5130
5131         UNLOCK(&sock->lock);
5132
5133         return (result);
5134 }
5135
5136 isc_result_t
5137 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5138         ISC_SOCKADDR_LEN_T len;
5139         isc_result_t result;
5140         char strbuf[ISC_STRERRORSIZE];
5141
5142         REQUIRE(VALID_SOCKET(sock));
5143         REQUIRE(addressp != NULL);
5144
5145         LOCK(&sock->lock);
5146
5147         if (!sock->bound) {
5148                 result = ISC_R_NOTBOUND;
5149                 goto out;
5150         }
5151
5152         result = ISC_R_SUCCESS;
5153
5154         len = sizeof(addressp->type);
5155         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5156                 isc__strerror(errno, strbuf, sizeof(strbuf));
5157                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5158                                  strbuf);
5159                 result = ISC_R_UNEXPECTED;
5160                 goto out;
5161         }
5162         addressp->length = (unsigned int)len;
5163
5164  out:
5165         UNLOCK(&sock->lock);
5166
5167         return (result);
5168 }
5169
5170 /*
5171  * Run through the list of events on this socket, and cancel the ones
5172  * queued for task "task" of type "how".  "how" is a bitmask.
5173  */
5174 void
5175 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5176
5177         REQUIRE(VALID_SOCKET(sock));
5178
5179         /*
5180          * Quick exit if there is nothing to do.  Don't even bother locking
5181          * in this case.
5182          */
5183         if (how == 0)
5184                 return;
5185
5186         LOCK(&sock->lock);
5187
5188         /*
5189          * All of these do the same thing, more or less.
5190          * Each will:
5191          *      o If the internal event is marked as "posted" try to
5192          *        remove it from the task's queue.  If this fails, mark it
5193          *        as canceled instead, and let the task clean it up later.
5194          *      o For each I/O request for that task of that type, post
5195          *        its done event with status of "ISC_R_CANCELED".
5196          *      o Reset any state needed.
5197          */
5198         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5199             && !ISC_LIST_EMPTY(sock->recv_list)) {
5200                 isc_socketevent_t      *dev;
5201                 isc_socketevent_t      *next;
5202                 isc_task_t             *current_task;
5203
5204                 dev = ISC_LIST_HEAD(sock->recv_list);
5205
5206                 while (dev != NULL) {
5207                         current_task = dev->ev_sender;
5208                         next = ISC_LIST_NEXT(dev, ev_link);
5209
5210                         if ((task == NULL) || (task == current_task)) {
5211                                 dev->result = ISC_R_CANCELED;
5212                                 send_recvdone_event(sock, &dev);
5213                         }
5214                         dev = next;
5215                 }
5216         }
5217
5218         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5219             && !ISC_LIST_EMPTY(sock->send_list)) {
5220                 isc_socketevent_t      *dev;
5221                 isc_socketevent_t      *next;
5222                 isc_task_t             *current_task;
5223
5224                 dev = ISC_LIST_HEAD(sock->send_list);
5225
5226                 while (dev != NULL) {
5227                         current_task = dev->ev_sender;
5228                         next = ISC_LIST_NEXT(dev, ev_link);
5229
5230                         if ((task == NULL) || (task == current_task)) {
5231                                 dev->result = ISC_R_CANCELED;
5232                                 send_senddone_event(sock, &dev);
5233                         }
5234                         dev = next;
5235                 }
5236         }
5237
5238         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5239             && !ISC_LIST_EMPTY(sock->accept_list)) {
5240                 isc_socket_newconnev_t *dev;
5241                 isc_socket_newconnev_t *next;
5242                 isc_task_t             *current_task;
5243
5244                 dev = ISC_LIST_HEAD(sock->accept_list);
5245                 while (dev != NULL) {
5246                         current_task = dev->ev_sender;
5247                         next = ISC_LIST_NEXT(dev, ev_link);
5248
5249                         if ((task == NULL) || (task == current_task)) {
5250
5251                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5252                                                 ev_link);
5253
5254                                 dev->newsocket->references--;
5255                                 free_socket(&dev->newsocket);
5256
5257                                 dev->result = ISC_R_CANCELED;
5258                                 dev->ev_sender = sock;
5259                                 isc_task_sendanddetach(&current_task,
5260                                                        ISC_EVENT_PTR(&dev));
5261                         }
5262
5263                         dev = next;
5264                 }
5265         }
5266
5267         /*
5268          * Connecting is not a list.
5269          */
5270         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5271             && sock->connect_ev != NULL) {
5272                 isc_socket_connev_t    *dev;
5273                 isc_task_t             *current_task;
5274
5275                 INSIST(sock->connecting);
5276                 sock->connecting = 0;
5277
5278                 dev = sock->connect_ev;
5279                 current_task = dev->ev_sender;
5280
5281                 if ((task == NULL) || (task == current_task)) {
5282                         sock->connect_ev = NULL;
5283
5284                         dev->result = ISC_R_CANCELED;
5285                         dev->ev_sender = sock;
5286                         isc_task_sendanddetach(&current_task,
5287                                                ISC_EVENT_PTR(&dev));
5288                 }
5289         }
5290
5291         UNLOCK(&sock->lock);
5292 }
5293
5294 isc_sockettype_t
5295 isc_socket_gettype(isc_socket_t *sock) {
5296         REQUIRE(VALID_SOCKET(sock));
5297
5298         return (sock->type);
5299 }
5300
5301 isc_boolean_t
5302 isc_socket_isbound(isc_socket_t *sock) {
5303         isc_boolean_t val;
5304
5305         LOCK(&sock->lock);
5306         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5307         UNLOCK(&sock->lock);
5308
5309         return (val);
5310 }
5311
5312 void
5313 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
5314 #if defined(IPV6_V6ONLY)
5315         int onoff = yes ? 1 : 0;
5316 #else
5317         UNUSED(yes);
5318         UNUSED(sock);
5319 #endif
5320
5321         REQUIRE(VALID_SOCKET(sock));
5322
5323 #ifdef IPV6_V6ONLY
5324         if (sock->pf == AF_INET6) {
5325                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5326                                (void *)&onoff, sizeof(int)) < 0) {
5327                         char strbuf[ISC_STRERRORSIZE];
5328
5329                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5330                                          "setsockopt(%d, IPV6_V6ONLY) "
5331                                          "%s: %s", sock->fd,
5332                                          isc_msgcat_get(isc_msgcat,
5333                                                         ISC_MSGSET_GENERAL,
5334                                                         ISC_MSG_FAILED,
5335                                                         "failed"),
5336                                          strbuf);
5337                 }
5338         }
5339         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5340 #endif
5341 }
5342
5343 #ifndef ISC_PLATFORM_USETHREADS
5344 /* In our assumed scenario, we can simply use a single static object. */
5345 static isc_socketwait_t swait_private;
5346
5347 int
5348 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
5349         int n;
5350 #ifdef USE_KQUEUE
5351         struct timespec ts, *tsp;
5352 #endif
5353 #ifdef USE_EPOLL
5354         int timeout;
5355 #endif
5356 #ifdef USE_DEVPOLL
5357         struct dvpoll dvp;
5358 #endif
5359
5360         REQUIRE(swaitp != NULL && *swaitp == NULL);
5361
5362         if (socketmgr == NULL)
5363                 return (0);
5364
5365 #ifdef USE_KQUEUE
5366         if (tvp != NULL) {
5367                 ts.tv_sec = tvp->tv_sec;
5368                 ts.tv_nsec = tvp->tv_usec * 1000;
5369                 tsp = &ts;
5370         } else
5371                 tsp = NULL;
5372         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
5373                                        socketmgr->events, socketmgr->nevents,
5374                                        tsp);
5375         n = swait_private.nevents;
5376 #elif defined(USE_EPOLL)
5377         if (tvp != NULL)
5378                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5379         else
5380                 timeout = -1;
5381         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
5382                                            socketmgr->events,
5383                                            socketmgr->nevents, timeout);
5384         n = swait_private.nevents;
5385 #elif defined(USE_DEVPOLL)
5386         dvp.dp_fds = socketmgr->events;
5387         dvp.dp_nfds = socketmgr->nevents;
5388         if (tvp != NULL) {
5389                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5390                         (tvp->tv_usec + 999) / 1000;
5391         } else
5392                 dvp.dp_timeout = -1;
5393         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
5394         n = swait_private.nevents;
5395 #elif defined(USE_SELECT)
5396         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
5397                socketmgr->fd_bufsize);
5398         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
5399                socketmgr->fd_bufsize);
5400
5401         swait_private.readset = socketmgr->read_fds_copy;
5402         swait_private.writeset = socketmgr->write_fds_copy;
5403         swait_private.maxfd = socketmgr->maxfd + 1;
5404
5405         n = select(swait_private.maxfd, swait_private.readset,
5406                    swait_private.writeset, NULL, tvp);
5407 #endif
5408
5409         *swaitp = &swait_private;
5410         return (n);
5411 }
5412
5413 isc_result_t
5414 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
5415         REQUIRE(swait == &swait_private);
5416
5417         if (socketmgr == NULL)
5418                 return (ISC_R_NOTFOUND);
5419
5420 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5421         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
5422         return (ISC_R_SUCCESS);
5423 #elif defined(USE_SELECT)
5424         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5425         return (ISC_R_SUCCESS);
5426 #endif
5427 }
5428 #endif /* ISC_PLATFORM_USETHREADS */
5429
5430 void
5431 isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
5432
5433         /*
5434          * Name 'socket'.
5435          */
5436
5437         REQUIRE(VALID_SOCKET(socket));
5438
5439         LOCK(&socket->lock);
5440         memset(socket->name, 0, sizeof(socket->name));
5441         strncpy(socket->name, name, sizeof(socket->name) - 1);
5442         socket->tag = tag;
5443         UNLOCK(&socket->lock);
5444 }
5445
5446 const char *
5447 isc_socket_getname(isc_socket_t *socket) {
5448         return (socket->name);
5449 }
5450
5451 void *
5452 isc_socket_gettag(isc_socket_t *socket) {
5453         return (socket->tag);
5454 }
5455
5456 #ifdef HAVE_LIBXML2
5457
5458 static const char *
5459 _socktype(isc_sockettype_t type)
5460 {
5461         if (type == isc_sockettype_udp)
5462                 return ("udp");
5463         else if (type == isc_sockettype_tcp)
5464                 return ("tcp");
5465         else if (type == isc_sockettype_unix)
5466                 return ("unix");
5467         else if (type == isc_sockettype_fdwatch)
5468                 return ("fdwatch");
5469         else
5470                 return ("not-initialized");
5471 }
5472
5473 void
5474 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
5475 {
5476         isc_socket_t *sock;
5477         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5478         isc_sockaddr_t addr;
5479         ISC_SOCKADDR_LEN_T len;
5480
5481         LOCK(&mgr->lock);
5482
5483 #ifndef ISC_PLATFORM_USETHREADS
5484         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5485         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5486         xmlTextWriterEndElement(writer);
5487 #endif
5488
5489         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5490         sock = ISC_LIST_HEAD(mgr->socklist);
5491         while (sock != NULL) {
5492                 LOCK(&sock->lock);
5493                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5494
5495                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5496                 xmlTextWriterWriteFormatString(writer, "%p", sock);
5497                 xmlTextWriterEndElement(writer);
5498
5499                 if (sock->name[0] != 0) {
5500                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5501                         xmlTextWriterWriteFormatString(writer, "%s",
5502                                                        sock->name);
5503                         xmlTextWriterEndElement(writer); /* name */
5504                 }
5505
5506                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5507                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5508                 xmlTextWriterEndElement(writer);
5509
5510                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5511                                           ISC_XMLCHAR _socktype(sock->type));
5512
5513                 if (sock->connected) {
5514                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5515                                             sizeof(peerbuf));
5516                         xmlTextWriterWriteElement(writer,
5517                                                   ISC_XMLCHAR "peer-address",
5518                                                   ISC_XMLCHAR peerbuf);
5519                 }
5520
5521                 len = sizeof(addr);
5522                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5523                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5524                         xmlTextWriterWriteElement(writer,
5525                                                   ISC_XMLCHAR "local-address",
5526                                                   ISC_XMLCHAR peerbuf);
5527                 }
5528
5529                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5530                 if (sock->pending_recv)
5531                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5532                                                 ISC_XMLCHAR "pending-receive");
5533                 if (sock->pending_send)
5534                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5535                                                   ISC_XMLCHAR "pending-send");
5536                 if (sock->pending_accept)
5537                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5538                                                  ISC_XMLCHAR "pending_accept");
5539                 if (sock->listener)
5540                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5541                                                   ISC_XMLCHAR "listener");
5542                 if (sock->connected)
5543                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5544                                                   ISC_XMLCHAR "connected");
5545                 if (sock->connecting)
5546                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5547                                                   ISC_XMLCHAR "connecting");
5548                 if (sock->bound)
5549                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5550                                                   ISC_XMLCHAR "bound");
5551
5552                 xmlTextWriterEndElement(writer); /* states */
5553
5554                 xmlTextWriterEndElement(writer); /* socket */
5555
5556                 UNLOCK(&sock->lock);
5557                 sock = ISC_LIST_NEXT(sock, link);
5558         }
5559         xmlTextWriterEndElement(writer); /* sockets */
5560
5561         UNLOCK(&mgr->lock);
5562 }
5563 #endif /* HAVE_LIBXML2 */