]> CyberLeo.Net >> Repos - FreeBSD/stable/8.git/blob - contrib/bind9/lib/isc/unix/socket.c
Update to version 9.6-ESV-R6, the latest from ISC, which contains numerous
[FreeBSD/stable/8.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id$ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #if defined(HAVE_SYS_DEVPOLL_H)
71 #include <sys/devpoll.h>
72 #elif defined(HAVE_DEVPOLL_H)
73 #include <devpoll.h>
74 #endif
75 #endif
76
77 #include "errno2result.h"
78
79 #ifndef ISC_PLATFORM_USETHREADS
80 #include "socket_p.h"
81 #endif /* ISC_PLATFORM_USETHREADS */
82
83 #if defined(SO_BSDCOMPAT) && defined(__linux__)
84 #include <sys/utsname.h>
85 #endif
86
/*%
 * Choose the most preferable multiplex method.
 *
 * The chain below defines one of USE_KQUEUE, USE_EPOLL, USE_DEVPOLL or
 * USE_SELECT.  The ISC_PLATFORM_HAVE* macros are tested in that order of
 * preference, and select() is the fallback when none of them is defined.
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*
 * Per-descriptor flags used only with /dev/poll: they record which
 * directions (read and/or write) are currently being polled for an FD.
 */
typedef struct {
        unsigned int want_read : 1,
                want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif  /* ISC_PLATFORM_HAVEKQUEUE */
103
/*
 * struct isc_socketwait is only defined for non-threaded builds.  Its
 * contents depend on the multiplex method chosen above.
 * NOTE(review): the users of this structure are outside this part of the
 * file; presumably it carries the outcome of one wait for socket events --
 * confirm against the callers.
 */
#ifndef ISC_PLATFORM_USETHREADS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
        int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
        fd_set *readset;
        fd_set *writeset;
        int nfds;
        int maxfd;
};
#endif  /* USE_KQUEUE */
#endif /* !ISC_PLATFORM_USETHREADS */
118
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system
 * default of) FD_SETSIZE descriptors, and the default size should in fact be
 * fine in the vast majority of cases.  This constant should therefore be
 * increased only when absolutely necessary and possible, i.e., the server is
 * exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 *
 * The default below applies only when ISC_SOCKET_MAXSOCKETS has not already
 * been defined: 4096 for kqueue, epoll and /dev/poll, FD_SETSIZE for select().
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#define ISC_SOCKET_MAXSOCKETS 4096
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif  /* USE_KQUEUE... */
#endif  /* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif  /* __APPLE__ */
#endif  /* USE_SELECT */
155
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

/* Watcher states of the workaround described above. */
typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

/*
 * Short poll timeout (in msec, per the description above) used in the
 * poll_active and poll_checking states.  May be overridden at build time.
 */
#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif  /* ISC_SOCKET_USE_POLLWATCH */
184
/*%
 * Size of per-FD lock buckets.
 *
 * FDLOCK_ID() maps a file descriptor to the index of the lock that protects
 * it.  With threads, descriptors are spread over FDLOCK_COUNT locks by
 * taking the descriptor modulo FDLOCK_COUNT; without threads there is a
 * single bucket and every descriptor maps to index 0.
 */
#ifdef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT            1024
#define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT            1
#define FDLOCK_ID(fd)           0
#endif  /* ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 *
 * Only meaningful for kqueue, epoll and /dev/poll; may be overridden at
 * build time by predefining ISC_SOCKET_MAXEVENTS.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#define ISC_SOCKET_MAXEVENTS    64
#endif
#endif

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 * The fallback used when nothing else defined it is unsigned int.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
213
/*%
 * "Soft" errors are errno values that do not indicate a real failure of a
 * network call such as recv(): the operation would block (EAGAIN,
 * EWOULDBLOCK) or was interrupted (EINTR).
 *
 * An errno of 0 is also accepted.  BSDI (and perhaps other systems) will
 * sometimes return <0 from recv() while leaving errno == 0; that is broken,
 * but it has to be worked around here.
 *
 * Note: the argument is evaluated more than once.
 */
#define SOFT_ERROR(e) \
        ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
226
/*
 * DLVL(x) expands to three comma-separated arguments -- log category, log
 * module and debug level x -- so it can be dropped into the argument list
 * of the logging helpers in this file.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL             90
#define CORRECTNESS_LEVEL       70
#define IOEVENT_LEVEL           60
#define EVENT_LEVEL             50
#define CREATION_LEVEL          20

/* Ready-made category/module/level triples for each of the levels above. */
#define TRACE           DLVL(TRACE_LEVEL)
#define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
#define IOEVENT         DLVL(IOEVENT_LEVEL)
#define EVENT           DLVL(EVENT_LEVEL)
#define CREATION        DLVL(CREATION_LEVEL)
247
/* Internal events are plain isc_event_t objects. */
typedef isc_event_t intev_t;

/* Magic number stored in isc_socket.magic and the matching validity check. */
#define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
252
/*!
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 *
 * USE_CMSG is turned on when the platform has struct in6_pktinfo.
 */
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifndef USE_CMSG
#define USE_CMSG        1
#endif
#endif

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 *
 * USE_CMSG is also turned on when SO_TIMESTAMP is available.
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG        1
#endif
#endif
274
/*%
 * The size to raise the receive buffer to (from BIND 8).
 * The value is 32 * 1024, i.e. 32768.
 */
#define RCVBUFSIZE (32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10
284
/*
 * A single socket handled by the socket manager.
 *
 * The members are grouped by locking discipline, as marked by the section
 * comments inside the structure.
 */
struct isc_socket {
        /* Not locked. */
        unsigned int            magic;          /* SOCKET_MAGIC when valid */
        isc_socketmgr_t        *manager;
        isc_mutex_t             lock;
        isc_sockettype_t        type;
        /*
         * NOTE(review): presumably points at one of the *statsindex tables
         * defined later in this file, selected by socket type and protocol
         * family -- confirm where it is assigned.
         */
        const isc_statscounter_t        *statsindex;

        /* Locked by socket lock. */
        ISC_LINK(isc_socket_t)  link;
        unsigned int            references;     /* 0 means dead: SOCK_DEAD() */
        int                     fd;             /* underlying descriptor */
        int                     pf;             /* compared against AF_INET6 */
        char                            name[16];
        void *                          tag;

        ISC_LIST(isc_socketevent_t)             send_list;
        ISC_LIST(isc_socketevent_t)             recv_list;
        ISC_LIST(isc_socket_newconnev_t)        accept_list;
        isc_socket_connev_t                    *connect_ev;

        /*
         * Internal events.  Posted when a descriptor is readable or
         * writable.  These are statically allocated and never freed.
         * They will be set to non-purgable before use.
         */
        intev_t                 readable_ev;
        intev_t                 writable_ev;

        isc_sockaddr_t          peer_address;  /* remote address */

        unsigned int            pending_recv : 1,
                                pending_send : 1,
                                pending_accept : 1,
                                listener : 1, /* listener socket */
                                connected : 1,
                                connecting : 1, /* connect pending */
                                bound : 1; /* bound to local addr */

#ifdef ISC_NET_RECVOVERFLOW
        unsigned char           overflow; /* used for MSG_TRUNC fake */
#endif

        /* Control-message buffers for receiving and sending, with lengths. */
        char                    *recvcmsgbuf;
        ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
        char                    *sendcmsgbuf;
        ISC_SOCKADDR_LEN_T      sendcmsgbuflen;

        /*
         * NOTE(review): the fdwatch* members are presumably used only by
         * fdwatch-type sockets (callback, its argument, flags and task) --
         * confirm against the fdwatch code.
         */
        void                    *fdwatcharg;
        isc_sockfdwatch_t       fdwatchcb;
        int                     fdwatchflags;
        isc_task_t              *fdwatchtask;
};
338
/* Magic number stored in isc_socketmgr.magic and the matching validity check. */
#define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*
 * The socket manager: owns the kernel multiplexing handle and the per-FD
 * tables.  Which multiplexing members exist depends on the USE_* macro
 * selected at the top of this file.  The members are grouped by locking
 * discipline, as marked by the section comments inside the structure.
 */
struct isc_socketmgr {
        /* Not locked. */
        unsigned int            magic;
        isc_mem_t              *mctx;
        isc_mutex_t             lock;
        isc_mutex_t             *fdlock;        /* indexed by FDLOCK_ID(fd) */
        isc_stats_t             *stats;
#ifdef USE_KQUEUE
        int                     kqueue_fd;      /* passed to kevent() */
        int                     nevents;
        struct kevent           *events;
#endif  /* USE_KQUEUE */
#ifdef USE_EPOLL
        int                     epoll_fd;       /* passed to epoll_ctl() */
        int                     nevents;
        struct epoll_event      *events;
#endif  /* USE_EPOLL */
#ifdef USE_DEVPOLL
        int                     devpoll_fd;     /* pollfd records are written here */
        int                     nevents;
        struct pollfd           *events;
#endif  /* USE_DEVPOLL */
#ifdef USE_SELECT
        int                     fd_bufsize;
#endif  /* USE_SELECT */
        unsigned int            maxsocks;       /* valid FDs are < maxsocks */
#ifdef ISC_PLATFORM_USETHREADS
        int                     pipe_fds[2];    /* select_poke() writes to [1] */
#endif

        /* Locked by fdlock. */
        isc_socket_t           **fds;
        int                     *fdstate;       /* CLOSED, MANAGED or CLOSE_PENDING */
#ifdef USE_DEVPOLL
        pollinfo_t              *fdpollinfo;    /* indexed by FD */
#endif

        /* Locked by manager lock. */
        ISC_LIST(isc_socket_t)  socklist;
#ifdef USE_SELECT
        fd_set                  *read_fds;
        fd_set                  *read_fds_copy;
        fd_set                  *write_fds;
        fd_set                  *write_fds_copy;
        int                     maxfd;
#endif  /* USE_SELECT */
        int                     reserved;       /* unlocked */
#ifdef ISC_PLATFORM_USETHREADS
        isc_thread_t            watcher;
        isc_condition_t         shutdown_ok;
#else /* ISC_PLATFORM_USETHREADS */
        unsigned int            refs;
#endif /* ISC_PLATFORM_USETHREADS */
};
396
/* Non-threaded builds keep a single file-scope manager pointer. */
#ifndef ISC_PLATFORM_USETHREADS
static isc_socketmgr_t *socketmgr = NULL;
#endif /* ISC_PLATFORM_USETHREADS */

/* Values stored in isc_socketmgr.fdstate[] for each descriptor. */
#define CLOSED                  0       /* this one must be zero */
#define MANAGED                 1
#define CLOSE_PENDING           2

/*
 * send() and recv() iovec counts
 *
 * When ISC_NET_RECVOVERFLOW is defined, the receive side gets one entry
 * more than ISC_SOCKET_MAXSCATTERGATHER.
 */
#define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
#endif
414
/*
 * Prototypes for file-local (static) helpers.  Their definitions are not
 * part of this section of the file.
 */
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
                                    isc_socket_t **);
static void destroy(isc_socket_t **);
/* Handlers with the (task, event) signature. */
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
                              struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
                              struct msghdr *, struct iovec *, size_t *);
/* Only declared in threaded builds. */
#ifdef ISC_PLATFORM_USETHREADS
static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
#endif
435
/*
 * Message codes passed as the "msg" argument of select_poke(),
 * wakeup_socket(), watch_fd() and unwatch_fd().  All are negative.  ACCEPT
 * shares its value with READ, and CONNECT with WRITE.
 */
#define SELECT_POKE_SHUTDOWN            (-1)
#define SELECT_POKE_NOTHING             (-2)
#define SELECT_POKE_READ                (-3)
#define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE               (-4)
#define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE               (-5)

/* True when no references to the socket remain. */
#define SOCK_DEAD(s)                    ((s)->references == 0)
445
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * STATID_* is the position of an event kind inside each of the tables
 * below; every table lists its counters in exactly that order.  An entry of
 * -1 means the table has no counter for that event; such an entry must
 * never be passed to inc_stats(), which REQUIREs counterid != -1.
 */
enum {
        STATID_OPEN = 0,
        STATID_OPENFAIL = 1,
        STATID_CLOSE = 2,
        STATID_BINDFAIL = 3,
        STATID_CONNECTFAIL = 4,
        STATID_CONNECT = 5,
        STATID_ACCEPTFAIL = 6,
        STATID_ACCEPT = 7,
        STATID_SENDFAIL = 8,
        STATID_RECVFAIL = 9
};
/*
 * UDP over IPv4.  NOTE(review): "upd" in the array name looks like a typo
 * for "udp"; it is left as is because the name may be referenced elsewhere.
 */
static const isc_statscounter_t upd4statsindex[] = {
        isc_sockstatscounter_udp4open,
        isc_sockstatscounter_udp4openfail,
        isc_sockstatscounter_udp4close,
        isc_sockstatscounter_udp4bindfail,
        isc_sockstatscounter_udp4connectfail,
        isc_sockstatscounter_udp4connect,
        -1,     /* STATID_ACCEPTFAIL: no counter */
        -1,     /* STATID_ACCEPT: no counter */
        isc_sockstatscounter_udp4sendfail,
        isc_sockstatscounter_udp4recvfail
};
/* UDP over IPv6 (same naming caveat as above). */
static const isc_statscounter_t upd6statsindex[] = {
        isc_sockstatscounter_udp6open,
        isc_sockstatscounter_udp6openfail,
        isc_sockstatscounter_udp6close,
        isc_sockstatscounter_udp6bindfail,
        isc_sockstatscounter_udp6connectfail,
        isc_sockstatscounter_udp6connect,
        -1,     /* STATID_ACCEPTFAIL: no counter */
        -1,     /* STATID_ACCEPT: no counter */
        isc_sockstatscounter_udp6sendfail,
        isc_sockstatscounter_udp6recvfail
};
/* TCP over IPv4. */
static const isc_statscounter_t tcp4statsindex[] = {
        isc_sockstatscounter_tcp4open,
        isc_sockstatscounter_tcp4openfail,
        isc_sockstatscounter_tcp4close,
        isc_sockstatscounter_tcp4bindfail,
        isc_sockstatscounter_tcp4connectfail,
        isc_sockstatscounter_tcp4connect,
        isc_sockstatscounter_tcp4acceptfail,
        isc_sockstatscounter_tcp4accept,
        isc_sockstatscounter_tcp4sendfail,
        isc_sockstatscounter_tcp4recvfail
};
/* TCP over IPv6. */
static const isc_statscounter_t tcp6statsindex[] = {
        isc_sockstatscounter_tcp6open,
        isc_sockstatscounter_tcp6openfail,
        isc_sockstatscounter_tcp6close,
        isc_sockstatscounter_tcp6bindfail,
        isc_sockstatscounter_tcp6connectfail,
        isc_sockstatscounter_tcp6connect,
        isc_sockstatscounter_tcp6acceptfail,
        isc_sockstatscounter_tcp6accept,
        isc_sockstatscounter_tcp6sendfail,
        isc_sockstatscounter_tcp6recvfail
};
/* UNIX domain sockets. */
static const isc_statscounter_t unixstatsindex[] = {
        isc_sockstatscounter_unixopen,
        isc_sockstatscounter_unixopenfail,
        isc_sockstatscounter_unixclose,
        isc_sockstatscounter_unixbindfail,
        isc_sockstatscounter_unixconnectfail,
        isc_sockstatscounter_unixconnect,
        isc_sockstatscounter_unixacceptfail,
        isc_sockstatscounter_unixaccept,
        isc_sockstatscounter_unixsendfail,
        isc_sockstatscounter_unixrecvfail
};
/* fdwatch sockets. */
static const isc_statscounter_t fdwatchstatsindex[] = {
        -1,     /* STATID_OPEN: no counter */
        -1,     /* STATID_OPENFAIL: no counter */
        isc_sockstatscounter_fdwatchclose,
        isc_sockstatscounter_fdwatchbindfail,
        isc_sockstatscounter_fdwatchconnectfail,
        isc_sockstatscounter_fdwatchconnect,
        -1,     /* STATID_ACCEPTFAIL: no counter */
        -1,     /* STATID_ACCEPT: no counter */
        isc_sockstatscounter_fdwatchsendfail,
        isc_sockstatscounter_fdwatchrecvfail
};
533
534 static void
535 manager_log(isc_socketmgr_t *sockmgr,
536             isc_logcategory_t *category, isc_logmodule_t *module, int level,
537             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
538 static void
539 manager_log(isc_socketmgr_t *sockmgr,
540             isc_logcategory_t *category, isc_logmodule_t *module, int level,
541             const char *fmt, ...)
542 {
543         char msgbuf[2048];
544         va_list ap;
545
546         if (! isc_log_wouldlog(isc_lctx, level))
547                 return;
548
549         va_start(ap, fmt);
550         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
551         va_end(ap);
552
553         isc_log_write(isc_lctx, category, module, level,
554                       "sockmgr %p: %s", sockmgr, msgbuf);
555 }
556
557 static void
558 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
559            isc_logcategory_t *category, isc_logmodule_t *module, int level,
560            isc_msgcat_t *msgcat, int msgset, int message,
561            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
562 static void
563 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
564            isc_logcategory_t *category, isc_logmodule_t *module, int level,
565            isc_msgcat_t *msgcat, int msgset, int message,
566            const char *fmt, ...)
567 {
568         char msgbuf[2048];
569         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
570         va_list ap;
571
572         if (! isc_log_wouldlog(isc_lctx, level))
573                 return;
574
575         va_start(ap, fmt);
576         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
577         va_end(ap);
578
579         if (address == NULL) {
580                 isc_log_iwrite(isc_lctx, category, module, level,
581                                msgcat, msgset, message,
582                                "socket %p: %s", sock, msgbuf);
583         } else {
584                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
585                 isc_log_iwrite(isc_lctx, category, module, level,
586                                msgcat, msgset, message,
587                                "socket %p %s: %s", sock, peerbuf, msgbuf);
588         }
589 }
590
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Turn IPV6_RECVPKTINFO back on for IPv6 UDP sockets; any other socket is
 * left alone.  A setsockopt() failure is reported through
 * UNEXPECTED_ERROR() and otherwise ignored.
 */
static void
FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
{
        int enable = 1;

        if (sock->pf == AF_INET6 && sock->type == isc_sockettype_udp &&
            setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                       (void *)&enable, sizeof(enable)) < 0) {
                char errtext[ISC_STRERRORSIZE];

                isc__strerror(errno, errtext, sizeof(errtext));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "setsockopt(%d, IPV6_RECVPKTINFO) "
                                 "%s: %s", sock->fd,
                                 isc_msgcat_get(isc_msgcat,
                                                ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED,
                                                "failed"),
                                 errtext);
        }
}
#else
/* No workaround needed on other platforms: expands to a no-op. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
623
624 /*%
625  * Increment socket-related statistics counters.
626  */
627 static inline void
628 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
629         REQUIRE(counterid != -1);
630
631         if (stats != NULL)
632                 isc_stats_increment(stats, counterid);
633 }
634
635 static inline isc_result_t
636 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
637         isc_result_t result = ISC_R_SUCCESS;
638
639 #ifdef USE_KQUEUE
640         struct kevent evchange;
641
642         memset(&evchange, 0, sizeof(evchange));
643         if (msg == SELECT_POKE_READ)
644                 evchange.filter = EVFILT_READ;
645         else
646                 evchange.filter = EVFILT_WRITE;
647         evchange.flags = EV_ADD;
648         evchange.ident = fd;
649         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
650                 result = isc__errno2result(errno);
651
652         return (result);
653 #elif defined(USE_EPOLL)
654         struct epoll_event event;
655
656         if (msg == SELECT_POKE_READ)
657                 event.events = EPOLLIN;
658         else
659                 event.events = EPOLLOUT;
660         memset(&event.data, 0, sizeof(event.data));
661         event.data.fd = fd;
662         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
663             errno != EEXIST) {
664                 result = isc__errno2result(errno);
665         }
666
667         return (result);
668 #elif defined(USE_DEVPOLL)
669         struct pollfd pfd;
670         int lockid = FDLOCK_ID(fd);
671
672         memset(&pfd, 0, sizeof(pfd));
673         if (msg == SELECT_POKE_READ)
674                 pfd.events = POLLIN;
675         else
676                 pfd.events = POLLOUT;
677         pfd.fd = fd;
678         pfd.revents = 0;
679         LOCK(&manager->fdlock[lockid]);
680         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
681                 result = isc__errno2result(errno);
682         else {
683                 if (msg == SELECT_POKE_READ)
684                         manager->fdpollinfo[fd].want_read = 1;
685                 else
686                         manager->fdpollinfo[fd].want_write = 1;
687         }
688         UNLOCK(&manager->fdlock[lockid]);
689
690         return (result);
691 #elif defined(USE_SELECT)
692         LOCK(&manager->lock);
693         if (msg == SELECT_POKE_READ)
694                 FD_SET(fd, manager->read_fds);
695         if (msg == SELECT_POKE_WRITE)
696                 FD_SET(fd, manager->write_fds);
697         UNLOCK(&manager->lock);
698
699         return (result);
700 #endif
701 }
702
703 static inline isc_result_t
704 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
705         isc_result_t result = ISC_R_SUCCESS;
706
707 #ifdef USE_KQUEUE
708         struct kevent evchange;
709
710         memset(&evchange, 0, sizeof(evchange));
711         if (msg == SELECT_POKE_READ)
712                 evchange.filter = EVFILT_READ;
713         else
714                 evchange.filter = EVFILT_WRITE;
715         evchange.flags = EV_DELETE;
716         evchange.ident = fd;
717         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
718                 result = isc__errno2result(errno);
719
720         return (result);
721 #elif defined(USE_EPOLL)
722         struct epoll_event event;
723
724         if (msg == SELECT_POKE_READ)
725                 event.events = EPOLLIN;
726         else
727                 event.events = EPOLLOUT;
728         memset(&event.data, 0, sizeof(event.data));
729         event.data.fd = fd;
730         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
731             errno != ENOENT) {
732                 char strbuf[ISC_STRERRORSIZE];
733                 isc__strerror(errno, strbuf, sizeof(strbuf));
734                 UNEXPECTED_ERROR(__FILE__, __LINE__,
735                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
736                 result = ISC_R_UNEXPECTED;
737         }
738         return (result);
739 #elif defined(USE_DEVPOLL)
740         struct pollfd pfds[2];
741         size_t writelen = sizeof(pfds[0]);
742         int lockid = FDLOCK_ID(fd);
743
744         memset(pfds, 0, sizeof(pfds));
745         pfds[0].events = POLLREMOVE;
746         pfds[0].fd = fd;
747
748         /*
749          * Canceling read or write polling via /dev/poll is tricky.  Since it
750          * only provides a way of canceling per FD, we may need to re-poll the
751          * socket for the other operation.
752          */
753         LOCK(&manager->fdlock[lockid]);
754         if (msg == SELECT_POKE_READ &&
755             manager->fdpollinfo[fd].want_write == 1) {
756                 pfds[1].events = POLLOUT;
757                 pfds[1].fd = fd;
758                 writelen += sizeof(pfds[1]);
759         }
760         if (msg == SELECT_POKE_WRITE &&
761             manager->fdpollinfo[fd].want_read == 1) {
762                 pfds[1].events = POLLIN;
763                 pfds[1].fd = fd;
764                 writelen += sizeof(pfds[1]);
765         }
766
767         if (write(manager->devpoll_fd, pfds, writelen) == -1)
768                 result = isc__errno2result(errno);
769         else {
770                 if (msg == SELECT_POKE_READ)
771                         manager->fdpollinfo[fd].want_read = 0;
772                 else
773                         manager->fdpollinfo[fd].want_write = 0;
774         }
775         UNLOCK(&manager->fdlock[lockid]);
776
777         return (result);
778 #elif defined(USE_SELECT)
779         LOCK(&manager->lock);
780         if (msg == SELECT_POKE_READ)
781                 FD_CLR(fd, manager->read_fds);
782         else if (msg == SELECT_POKE_WRITE)
783                 FD_CLR(fd, manager->write_fds);
784         UNLOCK(&manager->lock);
785
786         return (result);
787 #endif
788 }
789
790 static void
791 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
792         isc_result_t result;
793         int lockid = FDLOCK_ID(fd);
794
795         /*
796          * This is a wakeup on a socket.  If the socket is not in the
797          * process of being closed, start watching it for either reads
798          * or writes.
799          */
800
801         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
802
803         if (msg == SELECT_POKE_CLOSE) {
804                 /* No one should be updating fdstate, so no need to lock it */
805                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
806                 manager->fdstate[fd] = CLOSED;
807                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
808                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
809                 (void)close(fd);
810                 return;
811         }
812
813         LOCK(&manager->fdlock[lockid]);
814         if (manager->fdstate[fd] == CLOSE_PENDING) {
815                 UNLOCK(&manager->fdlock[lockid]);
816
817                 /*
818                  * We accept (and ignore) any error from unwatch_fd() as we are
819                  * closing the socket, hoping it doesn't leave dangling state in
820                  * the kernel.
821                  * Note that unwatch_fd() must be called after releasing the
822                  * fdlock; otherwise it could cause deadlock due to a lock order
823                  * reversal.
824                  */
825                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
826                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
827                 return;
828         }
829         if (manager->fdstate[fd] != MANAGED) {
830                 UNLOCK(&manager->fdlock[lockid]);
831                 return;
832         }
833         UNLOCK(&manager->fdlock[lockid]);
834
835         /*
836          * Set requested bit.
837          */
838         result = watch_fd(manager, fd, msg);
839         if (result != ISC_R_SUCCESS) {
840                 /*
841                  * XXXJT: what should we do?  Ignoring the failure of watching
842                  * a socket will make the application dysfunctional, but there
843                  * seems to be no reasonable recovery process.
844                  */
845                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
846                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
847                               "failed to start watching FD (%d): %s",
848                               fd, isc_result_totext(result));
849         }
850 }
851
852 #ifdef ISC_PLATFORM_USETHREADS
853 /*
854  * Poke the select loop when there is something for us to do.
855  * The write is required (by POSIX) to complete.  That is, we
856  * will not get partial writes.
857  */
858 static void
859 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
860         int cc;
861         int buf[2];
862         char strbuf[ISC_STRERRORSIZE];
863
864         buf[0] = fd;
865         buf[1] = msg;
866
867         do {
868                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
869 #ifdef ENOSR
870                 /*
871                  * Treat ENOSR as EAGAIN but loop slowly as it is
872                  * unlikely to clear fast.
873                  */
874                 if (cc < 0 && errno == ENOSR) {
875                         sleep(1);
876                         errno = EAGAIN;
877                 }
878 #endif
879         } while (cc < 0 && SOFT_ERROR(errno));
880
881         if (cc < 0) {
882                 isc__strerror(errno, strbuf, sizeof(strbuf));
883                 FATAL_ERROR(__FILE__, __LINE__,
884                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
885                                            ISC_MSG_WRITEFAILED,
886                                            "write() failed "
887                                            "during watcher poke: %s"),
888                             strbuf);
889         }
890
891         INSIST(cc == sizeof(buf));
892 }
893
894 /*
895  * Read a message on the internal fd.
896  */
897 static void
898 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
899         int buf[2];
900         int cc;
901         char strbuf[ISC_STRERRORSIZE];
902
903         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
904         if (cc < 0) {
905                 *msg = SELECT_POKE_NOTHING;
906                 *fd = -1;       /* Silence compiler. */
907                 if (SOFT_ERROR(errno))
908                         return;
909
910                 isc__strerror(errno, strbuf, sizeof(strbuf));
911                 FATAL_ERROR(__FILE__, __LINE__,
912                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
913                                            ISC_MSG_READFAILED,
914                                            "read() failed "
915                                            "during watcher poke: %s"),
916                             strbuf);
917
918                 return;
919         }
920         INSIST(cc == sizeof(buf));
921
922         *fd = buf[0];
923         *msg = buf[1];
924 }
925 #else /* ISC_PLATFORM_USETHREADS */
926 /*
927  * Update the state of the socketmgr when something changes.
928  */
929 static void
930 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
931         if (msg == SELECT_POKE_SHUTDOWN)
932                 return;
933         else if (fd >= 0)
934                 wakeup_socket(manager, fd, msg);
935         return;
936 }
937 #endif /* ISC_PLATFORM_USETHREADS */
938
939 /*
940  * Make a fd non-blocking.
941  */
942 static isc_result_t
943 make_nonblock(int fd) {
944         int ret;
945         int flags;
946         char strbuf[ISC_STRERRORSIZE];
947 #ifdef USE_FIONBIO_IOCTL
948         int on = 1;
949
950         ret = ioctl(fd, FIONBIO, (char *)&on);
951 #else
952         flags = fcntl(fd, F_GETFL, 0);
953         flags |= PORT_NONBLOCK;
954         ret = fcntl(fd, F_SETFL, flags);
955 #endif
956
957         if (ret == -1) {
958                 isc__strerror(errno, strbuf, sizeof(strbuf));
959                 UNEXPECTED_ERROR(__FILE__, __LINE__,
960 #ifdef USE_FIONBIO_IOCTL
961                                  "ioctl(%d, FIONBIO, &on): %s", fd,
962 #else
963                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
964 #endif
965                                  strbuf);
966
967                 return (ISC_R_UNEXPECTED);
968         }
969
970         return (ISC_R_SUCCESS);
971 }
972
973 #ifdef USE_CMSG
974 /*
975  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
976  * In order to ensure as much portability as possible, we provide wrapper
977  * functions of these macros.
978  * Note that cmsg_space() could run slow on OSes that do not have
979  * CMSG_SPACE.
980  */
981 static inline ISC_SOCKADDR_LEN_T
982 cmsg_len(ISC_SOCKADDR_LEN_T len) {
983 #ifdef CMSG_LEN
984         return (CMSG_LEN(len));
985 #else
986         ISC_SOCKADDR_LEN_T hdrlen;
987
988         /*
989          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
990          * is correct.
991          */
992         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
993         return (hdrlen + len);
994 #endif
995 }
996
997 static inline ISC_SOCKADDR_LEN_T
998 cmsg_space(ISC_SOCKADDR_LEN_T len) {
999 #ifdef CMSG_SPACE
1000         return (CMSG_SPACE(len));
1001 #else
1002         struct msghdr msg;
1003         struct cmsghdr *cmsgp;
1004         /*
1005          * XXX: The buffer length is an ad-hoc value, but should be enough
1006          * in a practical sense.
1007          */
1008         char dummybuf[sizeof(struct cmsghdr) + 1024];
1009
1010         memset(&msg, 0, sizeof(msg));
1011         msg.msg_control = dummybuf;
1012         msg.msg_controllen = sizeof(dummybuf);
1013
1014         cmsgp = (struct cmsghdr *)dummybuf;
1015         cmsgp->cmsg_len = cmsg_len(len);
1016
1017         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1018         if (cmsgp != NULL)
1019                 return ((char *)cmsgp - (char *)msg.msg_control);
1020         else
1021                 return (0);
1022 #endif
1023 }
1024 #endif /* USE_CMSG */
1025
1026 /*
1027  * Process control messages received on a socket.
1028  */
1029 static void
1030 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1031 #ifdef USE_CMSG
1032         struct cmsghdr *cmsgp;
1033 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1034         struct in6_pktinfo *pktinfop;
1035 #endif
1036 #ifdef SO_TIMESTAMP
1037         struct timeval *timevalp;
1038 #endif
1039 #endif
1040
1041         /*
1042          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1043          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1044          * They are all here, outside of the CPP tests, because it is
1045          * more consistent with the usual ISC coding style.
1046          */
1047         UNUSED(sock);
1048         UNUSED(msg);
1049         UNUSED(dev);
1050
1051 #ifdef ISC_NET_BSD44MSGHDR
1052
1053 #ifdef MSG_TRUNC
1054         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1055                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1056 #endif
1057
1058 #ifdef MSG_CTRUNC
1059         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1060                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1061 #endif
1062
1063 #ifndef USE_CMSG
1064         return;
1065 #else
1066         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1067                 return;
1068
1069 #ifdef SO_TIMESTAMP
1070         timevalp = NULL;
1071 #endif
1072 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1073         pktinfop = NULL;
1074 #endif
1075
1076         cmsgp = CMSG_FIRSTHDR(msg);
1077         while (cmsgp != NULL) {
1078                 socket_log(sock, NULL, TRACE,
1079                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1080                            "processing cmsg %p", cmsgp);
1081
1082 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1083                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1084                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1085
1086                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1087                         memcpy(&dev->pktinfo, pktinfop,
1088                                sizeof(struct in6_pktinfo));
1089                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1090                         socket_log(sock, NULL, TRACE,
1091                                    isc_msgcat, ISC_MSGSET_SOCKET,
1092                                    ISC_MSG_IFRECEIVED,
1093                                    "interface received on ifindex %u",
1094                                    dev->pktinfo.ipi6_ifindex);
1095                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1096                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1097                         goto next;
1098                 }
1099 #endif
1100
1101 #ifdef SO_TIMESTAMP
1102                 if (cmsgp->cmsg_level == SOL_SOCKET
1103                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1104                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1105                         dev->timestamp.seconds = timevalp->tv_sec;
1106                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1107                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1108                         goto next;
1109                 }
1110 #endif
1111
1112         next:
1113                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1114         }
1115 #endif /* USE_CMSG */
1116
1117 #endif /* ISC_NET_BSD44MSGHDR */
1118 }
1119
1120 /*
1121  * Construct an iov array and attach it to the msghdr passed in.  This is
1122  * the SEND constructor, which will use the used region of the buffer
1123  * (if using a buffer list) or will use the internal region (if a single
1124  * buffer I/O is requested).
1125  *
1126  * Nothing can be NULL, and the done event must list at least one buffer
1127  * on the buffer linked list for this function to be meaningful.
1128  *
1129  * If write_countp != NULL, *write_countp will hold the number of bytes
1130  * this transaction can send.
1131  */
1132 static void
1133 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
1134                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1135 {
1136         unsigned int iovcount;
1137         isc_buffer_t *buffer;
1138         isc_region_t used;
1139         size_t write_count;
1140         size_t skip_count;
1141
1142         memset(msg, 0, sizeof(*msg));
1143
1144         if (!sock->connected) {
1145                 msg->msg_name = (void *)&dev->address.type.sa;
1146                 msg->msg_namelen = dev->address.length;
1147         } else {
1148                 msg->msg_name = NULL;
1149                 msg->msg_namelen = 0;
1150         }
1151
1152         buffer = ISC_LIST_HEAD(dev->bufferlist);
1153         write_count = 0;
1154         iovcount = 0;
1155
1156         /*
1157          * Single buffer I/O?  Skip what we've done so far in this region.
1158          */
1159         if (buffer == NULL) {
1160                 write_count = dev->region.length - dev->n;
1161                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1162                 iov[0].iov_len = write_count;
1163                 iovcount = 1;
1164
1165                 goto config;
1166         }
1167
1168         /*
1169          * Multibuffer I/O.
1170          * Skip the data in the buffer list that we have already written.
1171          */
1172         skip_count = dev->n;
1173         while (buffer != NULL) {
1174                 REQUIRE(ISC_BUFFER_VALID(buffer));
1175                 if (skip_count < isc_buffer_usedlength(buffer))
1176                         break;
1177                 skip_count -= isc_buffer_usedlength(buffer);
1178                 buffer = ISC_LIST_NEXT(buffer, link);
1179         }
1180
1181         while (buffer != NULL) {
1182                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1183
1184                 isc_buffer_usedregion(buffer, &used);
1185
1186                 if (used.length > 0) {
1187                         iov[iovcount].iov_base = (void *)(used.base
1188                                                           + skip_count);
1189                         iov[iovcount].iov_len = used.length - skip_count;
1190                         write_count += (used.length - skip_count);
1191                         skip_count = 0;
1192                         iovcount++;
1193                 }
1194                 buffer = ISC_LIST_NEXT(buffer, link);
1195         }
1196
1197         INSIST(skip_count == 0U);
1198
1199  config:
1200         msg->msg_iov = iov;
1201         msg->msg_iovlen = iovcount;
1202
1203 #ifdef ISC_NET_BSD44MSGHDR
1204         msg->msg_control = NULL;
1205         msg->msg_controllen = 0;
1206         msg->msg_flags = 0;
1207 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1208         if ((sock->type == isc_sockettype_udp)
1209             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1210 #if defined(IPV6_USE_MIN_MTU)
1211                 int use_min_mtu = 1;    /* -1, 0, 1 */
1212 #endif
1213                 struct cmsghdr *cmsgp;
1214                 struct in6_pktinfo *pktinfop;
1215
1216                 socket_log(sock, NULL, TRACE,
1217                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1218                            "sendto pktinfo data, ifindex %u",
1219                            dev->pktinfo.ipi6_ifindex);
1220
1221                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1222                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1223                 msg->msg_control = (void *)sock->sendcmsgbuf;
1224
1225                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1226                 cmsgp->cmsg_level = IPPROTO_IPV6;
1227                 cmsgp->cmsg_type = IPV6_PKTINFO;
1228                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1229                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1230                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1231 #if defined(IPV6_USE_MIN_MTU)
1232                 /*
1233                  * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1234                  * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1235                  * is used.
1236                  */
1237                 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1238                                            msg->msg_controllen);
1239                 msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1240                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1241
1242                 cmsgp->cmsg_level = IPPROTO_IPV6;
1243                 cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1244                 cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1245                 memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1246 #endif
1247         }
1248 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1249 #else /* ISC_NET_BSD44MSGHDR */
1250         msg->msg_accrights = NULL;
1251         msg->msg_accrightslen = 0;
1252 #endif /* ISC_NET_BSD44MSGHDR */
1253
1254         if (write_countp != NULL)
1255                 *write_countp = write_count;
1256 }
1257
1258 /*
1259  * Construct an iov array and attach it to the msghdr passed in.  This is
1260  * the RECV constructor, which will use the available region of the buffer
1261  * (if using a buffer list) or will use the internal region (if a single
1262  * buffer I/O is requested).
1263  *
1264  * Nothing can be NULL, and the done event must list at least one buffer
1265  * on the buffer linked list for this function to be meaningful.
1266  *
1267  * If read_countp != NULL, *read_countp will hold the number of bytes
1268  * this transaction can receive.
1269  */
1270 static void
1271 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1272                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1273 {
1274         unsigned int iovcount;
1275         isc_buffer_t *buffer;
1276         isc_region_t available;
1277         size_t read_count;
1278
1279         memset(msg, 0, sizeof(struct msghdr));
1280
1281         if (sock->type == isc_sockettype_udp) {
1282                 memset(&dev->address, 0, sizeof(dev->address));
1283 #ifdef BROKEN_RECVMSG
1284                 if (sock->pf == AF_INET) {
1285                         msg->msg_name = (void *)&dev->address.type.sin;
1286                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1287                 } else if (sock->pf == AF_INET6) {
1288                         msg->msg_name = (void *)&dev->address.type.sin6;
1289                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1290 #ifdef ISC_PLATFORM_HAVESYSUNH
1291                 } else if (sock->pf == AF_UNIX) {
1292                         msg->msg_name = (void *)&dev->address.type.sunix;
1293                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1294 #endif
1295                 } else {
1296                         msg->msg_name = (void *)&dev->address.type.sa;
1297                         msg->msg_namelen = sizeof(dev->address.type);
1298                 }
1299 #else
1300                 msg->msg_name = (void *)&dev->address.type.sa;
1301                 msg->msg_namelen = sizeof(dev->address.type);
1302 #endif
1303 #ifdef ISC_NET_RECVOVERFLOW
1304                 /* If needed, steal one iovec for overflow detection. */
1305                 maxiov--;
1306 #endif
1307         } else { /* TCP */
1308                 msg->msg_name = NULL;
1309                 msg->msg_namelen = 0;
1310                 dev->address = sock->peer_address;
1311         }
1312
1313         buffer = ISC_LIST_HEAD(dev->bufferlist);
1314         read_count = 0;
1315
1316         /*
1317          * Single buffer I/O?  Skip what we've done so far in this region.
1318          */
1319         if (buffer == NULL) {
1320                 read_count = dev->region.length - dev->n;
1321                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1322                 iov[0].iov_len = read_count;
1323                 iovcount = 1;
1324
1325                 goto config;
1326         }
1327
1328         /*
1329          * Multibuffer I/O.
1330          * Skip empty buffers.
1331          */
1332         while (buffer != NULL) {
1333                 REQUIRE(ISC_BUFFER_VALID(buffer));
1334                 if (isc_buffer_availablelength(buffer) != 0)
1335                         break;
1336                 buffer = ISC_LIST_NEXT(buffer, link);
1337         }
1338
1339         iovcount = 0;
1340         while (buffer != NULL) {
1341                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1342
1343                 isc_buffer_availableregion(buffer, &available);
1344
1345                 if (available.length > 0) {
1346                         iov[iovcount].iov_base = (void *)(available.base);
1347                         iov[iovcount].iov_len = available.length;
1348                         read_count += available.length;
1349                         iovcount++;
1350                 }
1351                 buffer = ISC_LIST_NEXT(buffer, link);
1352         }
1353
1354  config:
1355
1356         /*
1357          * If needed, set up to receive that one extra byte.  Note that
1358          * we know there is at least one iov left, since we stole it
1359          * at the top of this function.
1360          */
1361 #ifdef ISC_NET_RECVOVERFLOW
1362         if (sock->type == isc_sockettype_udp) {
1363                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1364                 iov[iovcount].iov_len = 1;
1365                 iovcount++;
1366         }
1367 #endif
1368
1369         msg->msg_iov = iov;
1370         msg->msg_iovlen = iovcount;
1371
1372 #ifdef ISC_NET_BSD44MSGHDR
1373         msg->msg_control = NULL;
1374         msg->msg_controllen = 0;
1375         msg->msg_flags = 0;
1376 #if defined(USE_CMSG)
1377         if (sock->type == isc_sockettype_udp) {
1378                 msg->msg_control = sock->recvcmsgbuf;
1379                 msg->msg_controllen = sock->recvcmsgbuflen;
1380         }
1381 #endif /* USE_CMSG */
1382 #else /* ISC_NET_BSD44MSGHDR */
1383         msg->msg_accrights = NULL;
1384         msg->msg_accrightslen = 0;
1385 #endif /* ISC_NET_BSD44MSGHDR */
1386
1387         if (read_countp != NULL)
1388                 *read_countp = read_count;
1389 }
1390
1391 static void
1392 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1393                 isc_socketevent_t *dev)
1394 {
1395         if (sock->type == isc_sockettype_udp) {
1396                 if (address != NULL)
1397                         dev->address = *address;
1398                 else
1399                         dev->address = sock->peer_address;
1400         } else if (sock->type == isc_sockettype_tcp) {
1401                 INSIST(address == NULL);
1402                 dev->address = sock->peer_address;
1403         }
1404 }
1405
1406 static void
1407 destroy_socketevent(isc_event_t *event) {
1408         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1409
1410         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1411
1412         (ev->destroy)(event);
1413 }
1414
1415 static isc_socketevent_t *
1416 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1417                      isc_taskaction_t action, const void *arg)
1418 {
1419         isc_socketevent_t *ev;
1420
1421         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1422                                                      sock, eventtype,
1423                                                      action, arg,
1424                                                      sizeof(*ev));
1425
1426         if (ev == NULL)
1427                 return (NULL);
1428
1429         ev->result = ISC_R_UNSET;
1430         ISC_LINK_INIT(ev, ev_link);
1431         ISC_LIST_INIT(ev->bufferlist);
1432         ev->region.base = NULL;
1433         ev->n = 0;
1434         ev->offset = 0;
1435         ev->attributes = 0;
1436         ev->destroy = ev->ev_destroy;
1437         ev->ev_destroy = destroy_socketevent;
1438
1439         return (ev);
1440 }
1441
1442 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' to stdout -- the name
 * pointer and length, every iovec entry and, when ISC_NET_BSD44MSGHDR
 * is defined, the control buffer pointer and length.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        /*
         * %p requires a 'void *' argument, so the non-void pointers are
         * cast explicitly; the unsigned loop index is printed with %u.
         */
        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
1461 #endif
1462
/*
 * Status codes returned by the doio_recv() and doio_send() helpers.
 */
#define DOIO_SUCCESS            0       /* i/o ok, event sent */
#define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD               2       /* i/o error, event sent */
#define DOIO_EOF                3       /* EOF, no event sent */
1467
1468 static int
1469 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1470         int cc;
1471         struct iovec iov[MAXSCATTERGATHER_RECV];
1472         size_t read_count;
1473         size_t actual_count;
1474         struct msghdr msghdr;
1475         isc_buffer_t *buffer;
1476         int recv_errno;
1477         char strbuf[ISC_STRERRORSIZE];
1478
1479         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1480
1481 #if defined(ISC_SOCKET_DEBUG)
1482         dump_msg(&msghdr);
1483 #endif
1484
1485         cc = recvmsg(sock->fd, &msghdr, 0);
1486         recv_errno = errno;
1487
1488 #if defined(ISC_SOCKET_DEBUG)
1489         dump_msg(&msghdr);
1490 #endif
1491
1492         if (cc < 0) {
1493                 if (SOFT_ERROR(recv_errno))
1494                         return (DOIO_SOFT);
1495
1496                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1497                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1498                         socket_log(sock, NULL, IOEVENT,
1499                                    isc_msgcat, ISC_MSGSET_SOCKET,
1500                                    ISC_MSG_DOIORECV,
1501                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1502                                    sock->fd, cc, recv_errno, strbuf);
1503                 }
1504
1505 #define SOFT_OR_HARD(_system, _isc) \
1506         if (recv_errno == _system) { \
1507                 if (sock->connected) { \
1508                         dev->result = _isc; \
1509                         inc_stats(sock->manager->stats, \
1510                                   sock->statsindex[STATID_RECVFAIL]); \
1511                         return (DOIO_HARD); \
1512                 } \
1513                 return (DOIO_SOFT); \
1514         }
1515 #define ALWAYS_HARD(_system, _isc) \
1516         if (recv_errno == _system) { \
1517                 dev->result = _isc; \
1518                 inc_stats(sock->manager->stats, \
1519                           sock->statsindex[STATID_RECVFAIL]); \
1520                 return (DOIO_HARD); \
1521         }
1522
1523                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1524                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1525                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1526                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1527                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1528                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1529                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1530                 /*
1531                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1532                  * errors.
1533                  */
1534 #ifdef EPROTO
1535                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1536 #endif
1537                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1538
1539 #undef SOFT_OR_HARD
1540 #undef ALWAYS_HARD
1541
1542                 dev->result = isc__errno2result(recv_errno);
1543                 inc_stats(sock->manager->stats,
1544                           sock->statsindex[STATID_RECVFAIL]);
1545                 return (DOIO_HARD);
1546         }
1547
1548         /*
1549          * On TCP and UNIX sockets, zero length reads indicate EOF,
1550          * while on UDP sockets, zero length reads are perfectly valid,
1551          * although strange.
1552          */
1553         switch (sock->type) {
1554         case isc_sockettype_tcp:
1555         case isc_sockettype_unix:
1556                 if (cc == 0)
1557                         return (DOIO_EOF);
1558                 break;
1559         case isc_sockettype_udp:
1560                 break;
1561         case isc_sockettype_fdwatch:
1562         default:
1563                 INSIST(0);
1564         }
1565
1566         if (sock->type == isc_sockettype_udp) {
1567                 dev->address.length = msghdr.msg_namelen;
1568                 if (isc_sockaddr_getport(&dev->address) == 0) {
1569                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1570                                 socket_log(sock, &dev->address, IOEVENT,
1571                                            isc_msgcat, ISC_MSGSET_SOCKET,
1572                                            ISC_MSG_ZEROPORT,
1573                                            "dropping source port zero packet");
1574                         }
1575                         return (DOIO_SOFT);
1576                 }
1577         }
1578
1579         socket_log(sock, &dev->address, IOEVENT,
1580                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1581                    "packet received correctly");
1582
1583         /*
1584          * Overflow bit detection.  If we received MORE bytes than we should,
1585          * this indicates an overflow situation.  Set the flag in the
1586          * dev entry and adjust how much we read by one.
1587          */
1588 #ifdef ISC_NET_RECVOVERFLOW
1589         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1590                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1591                 cc--;
1592         }
1593 #endif
1594
1595         /*
1596          * If there are control messages attached, run through them and pull
1597          * out the interesting bits.
1598          */
1599         if (sock->type == isc_sockettype_udp)
1600                 process_cmsg(sock, &msghdr, dev);
1601
1602         /*
1603          * update the buffers (if any) and the i/o count
1604          */
1605         dev->n += cc;
1606         actual_count = cc;
1607         buffer = ISC_LIST_HEAD(dev->bufferlist);
1608         while (buffer != NULL && actual_count > 0U) {
1609                 REQUIRE(ISC_BUFFER_VALID(buffer));
1610                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1611                         actual_count -= isc_buffer_availablelength(buffer);
1612                         isc_buffer_add(buffer,
1613                                        isc_buffer_availablelength(buffer));
1614                 } else {
1615                         isc_buffer_add(buffer, actual_count);
1616                         actual_count = 0;
1617                         POST(actual_count);
1618                         break;
1619                 }
1620                 buffer = ISC_LIST_NEXT(buffer, link);
1621                 if (buffer == NULL) {
1622                         INSIST(actual_count == 0U);
1623                 }
1624         }
1625
1626         /*
1627          * If we read less than we expected, update counters,
1628          * and let the upper layer poke the descriptor.
1629          */
1630         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1631                 return (DOIO_SOFT);
1632
1633         /*
1634          * Full reads are posted, or partials if partials are ok.
1635          */
1636         dev->result = ISC_R_SUCCESS;
1637         return (DOIO_SUCCESS);
1638 }
1639
1640 /*
1641  * Returns:
1642  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1643  *                      ISC_R_SUCCESS.
1644  *
1645  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1646  *                      dev->result contains the appropriate error.
1647  *
1648  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1649  *                      event was sent.  The operation should be retried.
1650  *
1651  *      No other return values are possible.
1652  */
1653 static int
1654 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1655         int cc;
1656         struct iovec iov[MAXSCATTERGATHER_SEND];
1657         size_t write_count;
1658         struct msghdr msghdr;
1659         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1660         int attempts = 0;
1661         int send_errno;
1662         char strbuf[ISC_STRERRORSIZE];
1663
1664         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1665
1666  resend:
1667         cc = sendmsg(sock->fd, &msghdr, 0);
1668         send_errno = errno;
1669
1670         /*
1671          * Check for error or block condition.
1672          */
1673         if (cc < 0) {
1674                 if (send_errno == EINTR && ++attempts < NRETRIES)
1675                         goto resend;
1676
1677                 if (SOFT_ERROR(send_errno))
1678                         return (DOIO_SOFT);
1679
1680 #define SOFT_OR_HARD(_system, _isc) \
1681         if (send_errno == _system) { \
1682                 if (sock->connected) { \
1683                         dev->result = _isc; \
1684                         inc_stats(sock->manager->stats, \
1685                                   sock->statsindex[STATID_SENDFAIL]); \
1686                         return (DOIO_HARD); \
1687                 } \
1688                 return (DOIO_SOFT); \
1689         }
1690 #define ALWAYS_HARD(_system, _isc) \
1691         if (send_errno == _system) { \
1692                 dev->result = _isc; \
1693                 inc_stats(sock->manager->stats, \
1694                           sock->statsindex[STATID_SENDFAIL]); \
1695                 return (DOIO_HARD); \
1696         }
1697
1698                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1699                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1700                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1701                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1702                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1703 #ifdef EHOSTDOWN
1704                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1705 #endif
1706                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1707                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1708                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1709                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1710                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1711
1712 #undef SOFT_OR_HARD
1713 #undef ALWAYS_HARD
1714
1715                 /*
1716                  * The other error types depend on whether or not the
1717                  * socket is UDP or TCP.  If it is UDP, some errors
1718                  * that we expect to be fatal under TCP are merely
1719                  * annoying, and are really soft errors.
1720                  *
1721                  * However, these soft errors are still returned as
1722                  * a status.
1723                  */
1724                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1725                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1726                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1727                                  addrbuf, strbuf);
1728                 dev->result = isc__errno2result(send_errno);
1729                 inc_stats(sock->manager->stats,
1730                           sock->statsindex[STATID_SENDFAIL]);
1731                 return (DOIO_HARD);
1732         }
1733
1734         if (cc == 0) {
1735                 inc_stats(sock->manager->stats,
1736                           sock->statsindex[STATID_SENDFAIL]);
1737                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1738                                  "doio_send: send() %s 0",
1739                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1740                                                 ISC_MSG_RETURNED, "returned"));
1741         }
1742
1743         /*
1744          * If we write less than we expected, update counters, poke.
1745          */
1746         dev->n += cc;
1747         if ((size_t)cc != write_count)
1748                 return (DOIO_SOFT);
1749
1750         /*
1751          * Exactly what we wanted to write.  We're done with this
1752          * entry.  Post its completion event.
1753          */
1754         dev->result = ISC_R_SUCCESS;
1755         return (DOIO_SUCCESS);
1756 }
1757
1758 /*
1759  * Kill.
1760  *
1761  * Caller must ensure that the socket is not locked and no external
1762  * references exist.
1763  */
1764 static void
1765 closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1766         isc_sockettype_t type = sock->type;
1767         int lockid = FDLOCK_ID(fd);
1768
1769         /*
1770          * No one has this socket open, so the watcher doesn't have to be
1771          * poked, and the socket doesn't have to be locked.
1772          */
1773         LOCK(&manager->fdlock[lockid]);
1774         manager->fds[fd] = NULL;
1775         if (type == isc_sockettype_fdwatch)
1776                 manager->fdstate[fd] = CLOSED;
1777         else
1778                 manager->fdstate[fd] = CLOSE_PENDING;
1779         UNLOCK(&manager->fdlock[lockid]);
1780         if (type == isc_sockettype_fdwatch) {
1781                 /*
1782                  * The caller may close the socket once this function returns,
1783                  * and `fd' may be reassigned for a new socket.  So we do
1784                  * unwatch_fd() here, rather than defer it via select_poke().
1785                  * Note: this may complicate data protection among threads and
1786                  * may reduce performance due to additional locks.  One way to
1787                  * solve this would be to dup() the watched descriptor, but we
1788                  * take a simpler approach at this moment.
1789                  */
1790                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1791                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1792         } else
1793                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1794
1795         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1796
1797         /*
1798          * update manager->maxfd here (XXX: this should be implemented more
1799          * efficiently)
1800          */
1801 #ifdef USE_SELECT
1802         LOCK(&manager->lock);
1803         if (manager->maxfd == fd) {
1804                 int i;
1805
1806                 manager->maxfd = 0;
1807                 for (i = fd - 1; i >= 0; i--) {
1808                         lockid = FDLOCK_ID(i);
1809
1810                         LOCK(&manager->fdlock[lockid]);
1811                         if (manager->fdstate[i] == MANAGED) {
1812                                 manager->maxfd = i;
1813                                 UNLOCK(&manager->fdlock[lockid]);
1814                                 break;
1815                         }
1816                         UNLOCK(&manager->fdlock[lockid]);
1817                 }
1818 #ifdef ISC_PLATFORM_USETHREADS
1819                 if (manager->maxfd < manager->pipe_fds[0])
1820                         manager->maxfd = manager->pipe_fds[0];
1821 #endif
1822         }
1823         UNLOCK(&manager->lock);
1824 #endif  /* USE_SELECT */
1825 }
1826
1827 static void
1828 destroy(isc_socket_t **sockp) {
1829         int fd;
1830         isc_socket_t *sock = *sockp;
1831         isc_socketmgr_t *manager = sock->manager;
1832
1833         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1834                    ISC_MSG_DESTROYING, "destroying");
1835
1836         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1837         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1838         INSIST(ISC_LIST_EMPTY(sock->send_list));
1839         INSIST(sock->connect_ev == NULL);
1840         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1841
1842         if (sock->fd >= 0) {
1843                 fd = sock->fd;
1844                 sock->fd = -1;
1845                 closesocket(manager, sock, fd);
1846         }
1847
1848         LOCK(&manager->lock);
1849
1850         ISC_LIST_UNLINK(manager->socklist, sock, link);
1851
1852 #ifdef ISC_PLATFORM_USETHREADS
1853         if (ISC_LIST_EMPTY(manager->socklist))
1854                 SIGNAL(&manager->shutdown_ok);
1855 #endif /* ISC_PLATFORM_USETHREADS */
1856
1857         /* can't unlock manager as its memory context is still used */
1858         free_socket(sockp);
1859
1860         UNLOCK(&manager->lock);
1861 }
1862
1863 static isc_result_t
1864 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1865                 isc_socket_t **socketp)
1866 {
1867         isc_socket_t *sock;
1868         isc_result_t result;
1869         ISC_SOCKADDR_LEN_T cmsgbuflen;
1870
1871         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1872
1873         if (sock == NULL)
1874                 return (ISC_R_NOMEMORY);
1875
1876         sock->magic = 0;
1877         sock->references = 0;
1878
1879         sock->manager = manager;
1880         sock->type = type;
1881         sock->fd = -1;
1882         sock->statsindex = NULL;
1883
1884         ISC_LINK_INIT(sock, link);
1885
1886         sock->recvcmsgbuf = NULL;
1887         sock->sendcmsgbuf = NULL;
1888
1889         /*
1890          * set up cmsg buffers
1891          */
1892         cmsgbuflen = 0;
1893 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1894         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
1895 #endif
1896 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1897         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1898 #endif
1899         sock->recvcmsgbuflen = cmsgbuflen;
1900         if (sock->recvcmsgbuflen != 0U) {
1901                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1902                 if (sock->recvcmsgbuf == NULL) {
1903                         result = ISC_R_NOMEMORY;
1904                         goto error;
1905                 }
1906         }
1907
1908         cmsgbuflen = 0;
1909 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1910         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
1911 #if defined(IPV6_USE_MIN_MTU)
1912         /*
1913          * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
1914          * support.
1915          */
1916         cmsgbuflen += cmsg_space(sizeof(int));
1917 #endif
1918 #endif
1919         sock->sendcmsgbuflen = cmsgbuflen;
1920         if (sock->sendcmsgbuflen != 0U) {
1921                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1922                 if (sock->sendcmsgbuf == NULL) {
1923                         result = ISC_R_NOMEMORY;
1924                         goto error;
1925                 }
1926         }
1927
1928         memset(sock->name, 0, sizeof(sock->name));
1929         sock->tag = NULL;
1930
1931         /*
1932          * set up list of readers and writers to be initially empty
1933          */
1934         ISC_LIST_INIT(sock->recv_list);
1935         ISC_LIST_INIT(sock->send_list);
1936         ISC_LIST_INIT(sock->accept_list);
1937         sock->connect_ev = NULL;
1938         sock->pending_recv = 0;
1939         sock->pending_send = 0;
1940         sock->pending_accept = 0;
1941         sock->listener = 0;
1942         sock->connected = 0;
1943         sock->connecting = 0;
1944         sock->bound = 0;
1945
1946         /*
1947          * initialize the lock
1948          */
1949         result = isc_mutex_init(&sock->lock);
1950         if (result != ISC_R_SUCCESS) {
1951                 sock->magic = 0;
1952                 goto error;
1953         }
1954
1955         /*
1956          * Initialize readable and writable events
1957          */
1958         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1959                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1960                        NULL, sock, sock, NULL, NULL);
1961         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1962                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1963                        NULL, sock, sock, NULL, NULL);
1964
1965         sock->magic = SOCKET_MAGIC;
1966         *socketp = sock;
1967
1968         return (ISC_R_SUCCESS);
1969
1970  error:
1971         if (sock->recvcmsgbuf != NULL)
1972                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1973                             sock->recvcmsgbuflen);
1974         if (sock->sendcmsgbuf != NULL)
1975                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1976                             sock->sendcmsgbuflen);
1977         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1978
1979         return (result);
1980 }
1981
1982 /*
1983  * This event requires that the various lists be empty, that the reference
1984  * count be 1, and that the magic number is valid.  The other socket bits,
1985  * like the lock, must be initialized as well.  The fd associated must be
1986  * marked as closed, by setting it to -1 on close, or this routine will
1987  * also close the socket.
1988  */
1989 static void
1990 free_socket(isc_socket_t **socketp) {
1991         isc_socket_t *sock = *socketp;
1992
1993         INSIST(sock->references == 0);
1994         INSIST(VALID_SOCKET(sock));
1995         INSIST(!sock->connecting);
1996         INSIST(!sock->pending_recv);
1997         INSIST(!sock->pending_send);
1998         INSIST(!sock->pending_accept);
1999         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2000         INSIST(ISC_LIST_EMPTY(sock->send_list));
2001         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2002         INSIST(!ISC_LINK_LINKED(sock, link));
2003
2004         if (sock->recvcmsgbuf != NULL)
2005                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
2006                             sock->recvcmsgbuflen);
2007         if (sock->sendcmsgbuf != NULL)
2008                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
2009                             sock->sendcmsgbuflen);
2010
2011         sock->magic = 0;
2012
2013         DESTROYLOCK(&sock->lock);
2014
2015         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
2016
2017         *socketp = NULL;
2018 }
2019
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
/* ISC_TRUE while SO_BSDCOMPAT should still be set on new sockets. */
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Decide whether SO_BSDCOMPAT is still wanted.  On Linux, 'bsdcompat' is
 * cleared when the running kernel's release string parses as version 2.4
 * or later; if the release cannot be obtained or does not start with
 * "<major>." the flag is left set.  On other platforms this does nothing.
 *
 * Meant to be run once through 'bsdcompat_once'.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
         struct utsname buf;
         char *endp;
         long int major;
         long int minor;

         /*
          * uname() is not expected to fail, but if it does 'buf' is not
          * filled in and must not be parsed.  Keep the conservative
          * default in that case.
          */
         if (uname(&buf) < 0)
                return;

         /* Paranoia in parsing can be increased, but we trust uname(). */
         major = strtol(buf.release, &endp, 10);
         if (*endp == '.') {
                minor = strtol(endp+1, &endp, 10);
                if ((major > 2) || ((major == 2) && (minor >= 4))) {
                        bsdcompat = ISC_FALSE;
                }
         }
#endif /* __linux__ */
}
#endif /* SO_BSDCOMPAT */
2057
2058 static isc_result_t
2059 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
2060         isc_result_t result;
2061         char strbuf[ISC_STRERRORSIZE];
2062         const char *err = "socket";
2063         int tries = 0;
2064 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2065         int on = 1;
2066 #endif
2067 #if defined(SO_RCVBUF)
2068         ISC_SOCKADDR_LEN_T optlen;
2069         int size;
2070 #endif
2071
2072  again:
2073         switch (sock->type) {
2074         case isc_sockettype_udp:
2075                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2076                 break;
2077         case isc_sockettype_tcp:
2078                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2079                 break;
2080         case isc_sockettype_unix:
2081                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2082                 break;
2083         case isc_sockettype_fdwatch:
2084                 /*
2085                  * We should not be called for isc_sockettype_fdwatch sockets.
2086                  */
2087                 INSIST(0);
2088                 break;
2089         }
2090         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2091                 goto again;
2092
2093 #ifdef F_DUPFD
2094         /*
2095          * Leave a space for stdio and TCP to work in.
2096          */
2097         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2098             sock->fd >= 0 && sock->fd < manager->reserved) {
2099                 int new, tmp;
2100                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2101                 tmp = errno;
2102                 (void)close(sock->fd);
2103                 errno = tmp;
2104                 sock->fd = new;
2105                 err = "isc_socket_create: fcntl/reserved";
2106         } else if (sock->fd >= 0 && sock->fd < 20) {
2107                 int new, tmp;
2108                 new = fcntl(sock->fd, F_DUPFD, 20);
2109                 tmp = errno;
2110                 (void)close(sock->fd);
2111                 errno = tmp;
2112                 sock->fd = new;
2113                 err = "isc_socket_create: fcntl";
2114         }
2115 #endif
2116
2117         if (sock->fd >= (int)manager->maxsocks) {
2118                 (void)close(sock->fd);
2119                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2120                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2121                                isc_msgcat, ISC_MSGSET_SOCKET,
2122                                ISC_MSG_TOOMANYFDS,
2123                                "socket: file descriptor exceeds limit (%d/%u)",
2124                                sock->fd, manager->maxsocks);
2125                 return (ISC_R_NORESOURCES);
2126         }
2127
2128         if (sock->fd < 0) {
2129                 switch (errno) {
2130                 case EMFILE:
2131                 case ENFILE:
2132                         isc__strerror(errno, strbuf, sizeof(strbuf));
2133                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2134                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2135                                        isc_msgcat, ISC_MSGSET_SOCKET,
2136                                        ISC_MSG_TOOMANYFDS,
2137                                        "%s: %s", err, strbuf);
2138                         /* fallthrough */
2139                 case ENOBUFS:
2140                         return (ISC_R_NORESOURCES);
2141
2142                 case EPROTONOSUPPORT:
2143                 case EPFNOSUPPORT:
2144                 case EAFNOSUPPORT:
2145                 /*
2146                  * Linux 2.2 (and maybe others) return EINVAL instead of
2147                  * EAFNOSUPPORT.
2148                  */
2149                 case EINVAL:
2150                         return (ISC_R_FAMILYNOSUPPORT);
2151
2152                 default:
2153                         isc__strerror(errno, strbuf, sizeof(strbuf));
2154                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2155                                          "%s() %s: %s", err,
2156                                          isc_msgcat_get(isc_msgcat,
2157                                                         ISC_MSGSET_GENERAL,
2158                                                         ISC_MSG_FAILED,
2159                                                         "failed"),
2160                                          strbuf);
2161                         return (ISC_R_UNEXPECTED);
2162                 }
2163         }
2164
2165         result = make_nonblock(sock->fd);
2166         if (result != ISC_R_SUCCESS) {
2167                 (void)close(sock->fd);
2168                 return (result);
2169         }
2170
2171 #ifdef SO_BSDCOMPAT
2172         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2173                                   clear_bsdcompat) == ISC_R_SUCCESS);
2174         if (sock->type != isc_sockettype_unix && bsdcompat &&
2175             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2176                        (void *)&on, sizeof(on)) < 0) {
2177                 isc__strerror(errno, strbuf, sizeof(strbuf));
2178                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2179                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2180                                  sock->fd,
2181                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2182                                                 ISC_MSG_FAILED, "failed"),
2183                                  strbuf);
2184                 /* Press on... */
2185         }
2186 #endif
2187
2188 #ifdef SO_NOSIGPIPE
2189         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2190                        (void *)&on, sizeof(on)) < 0) {
2191                 isc__strerror(errno, strbuf, sizeof(strbuf));
2192                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2193                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2194                                  sock->fd,
2195                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2196                                                 ISC_MSG_FAILED, "failed"),
2197                                  strbuf);
2198                 /* Press on... */
2199         }
2200 #endif
2201
2202 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2203         if (sock->type == isc_sockettype_udp) {
2204
2205 #if defined(USE_CMSG)
2206 #if defined(SO_TIMESTAMP)
2207                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2208                                (void *)&on, sizeof(on)) < 0
2209                     && errno != ENOPROTOOPT) {
2210                         isc__strerror(errno, strbuf, sizeof(strbuf));
2211                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2212                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2213                                          sock->fd,
2214                                          isc_msgcat_get(isc_msgcat,
2215                                                         ISC_MSGSET_GENERAL,
2216                                                         ISC_MSG_FAILED,
2217                                                         "failed"),
2218                                          strbuf);
2219                         /* Press on... */
2220                 }
2221 #endif /* SO_TIMESTAMP */
2222
2223 #if defined(ISC_PLATFORM_HAVEIPV6)
2224                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2225                         /*
2226                          * Warn explicitly because this anomaly can be hidden
2227                          * in usual operation (and unexpectedly appear later).
2228                          */
2229                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2230                                          "No buffer available to receive "
2231                                          "IPv6 destination");
2232                 }
2233 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2234 #ifdef IPV6_RECVPKTINFO
2235                 /* RFC 3542 */
2236                 if ((sock->pf == AF_INET6)
2237                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2238                                    (void *)&on, sizeof(on)) < 0)) {
2239                         isc__strerror(errno, strbuf, sizeof(strbuf));
2240                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2241                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2242                                          "%s: %s", sock->fd,
2243                                          isc_msgcat_get(isc_msgcat,
2244                                                         ISC_MSGSET_GENERAL,
2245                                                         ISC_MSG_FAILED,
2246                                                         "failed"),
2247                                          strbuf);
2248                 }
2249 #else
2250                 /* RFC 2292 */
2251                 if ((sock->pf == AF_INET6)
2252                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2253                                    (void *)&on, sizeof(on)) < 0)) {
2254                         isc__strerror(errno, strbuf, sizeof(strbuf));
2255                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2256                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2257                                          sock->fd,
2258                                          isc_msgcat_get(isc_msgcat,
2259                                                         ISC_MSGSET_GENERAL,
2260                                                         ISC_MSG_FAILED,
2261                                                         "failed"),
2262                                          strbuf);
2263                 }
2264 #endif /* IPV6_RECVPKTINFO */
2265 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2266 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2267                 /* use minimum MTU */
2268                 if (sock->pf == AF_INET6 &&
2269                     setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2270                                (void *)&on, sizeof(on)) < 0) {
2271                         isc__strerror(errno, strbuf, sizeof(strbuf));
2272                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2273                                          "setsockopt(%d, IPV6_USE_MIN_MTU) "
2274                                          "%s: %s", sock->fd,
2275                                          isc_msgcat_get(isc_msgcat,
2276                                                         ISC_MSGSET_GENERAL,
2277                                                         ISC_MSG_FAILED,
2278                                                         "failed"),
2279                                          strbuf);
2280                 }
2281 #endif
2282 #if defined(IPV6_MTU)
2283                 /*
2284                  * Use minimum MTU on IPv6 sockets.
2285                  */
2286                 if (sock->pf == AF_INET6) {
2287                         int mtu = 1280;
2288                         (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
2289                                          &mtu, sizeof(mtu));
2290                 }
2291 #endif
2292 #if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2293                 /*
2294                  * Turn off Path MTU discovery on IPv6/UDP sockets.
2295                  */
2296                 if (sock->pf == AF_INET6) {
2297                         int action = IPV6_PMTUDISC_DONT;
2298                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2299                                          IPV6_MTU_DISCOVER, &action,
2300                                          sizeof(action));
2301                 }
2302 #endif
2303 #endif /* ISC_PLATFORM_HAVEIPV6 */
2304 #endif /* defined(USE_CMSG) */
2305
2306 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2307                 /*
2308                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2309                  */
2310                 if (sock->pf == AF_INET) {
2311                         int action = IP_PMTUDISC_DONT;
2312                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2313                                          &action, sizeof(action));
2314                 }
2315 #endif
2316 #if defined(IP_DONTFRAG)
2317                 /*
2318                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2319                  */
2320                 if (sock->pf == AF_INET) {
2321                         int off = 0;
2322                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2323                                          &off, sizeof(off));
2324                 }
2325 #endif
2326
2327 #if defined(SO_RCVBUF)
2328                 optlen = sizeof(size);
2329                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2330                                (void *)&size, &optlen) >= 0 &&
2331                      size < RCVBUFSIZE) {
2332                         size = RCVBUFSIZE;
2333                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2334                                        (void *)&size, sizeof(size)) == -1) {
2335                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2336                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2337                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2338                                         sock->fd, size,
2339                                         isc_msgcat_get(isc_msgcat,
2340                                                        ISC_MSGSET_GENERAL,
2341                                                        ISC_MSG_FAILED,
2342                                                        "failed"),
2343                                         strbuf);
2344                         }
2345                 }
2346 #endif
2347         }
2348 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2349
2350         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2351
2352         return (ISC_R_SUCCESS);
2353 }
2354
2355 /*%
2356  * Create a new 'type' socket managed by 'manager'.  Events
2357  * will be posted to 'task' and when dispatched 'action' will be
2358  * called with 'arg' as the arg value.  The new socket is returned
2359  * in 'socketp'.
2360  */
2361 isc_result_t
2362 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2363                   isc_socket_t **socketp)
2364 {
2365         isc_socket_t *sock = NULL;
2366         isc_result_t result;
2367         int lockid;
2368
2369         REQUIRE(VALID_MANAGER(manager));
2370         REQUIRE(socketp != NULL && *socketp == NULL);
2371         REQUIRE(type != isc_sockettype_fdwatch);
2372
2373         result = allocate_socket(manager, type, &sock);
2374         if (result != ISC_R_SUCCESS)
2375                 return (result);
2376
2377         switch (sock->type) {
2378         case isc_sockettype_udp:
2379                 sock->statsindex =
2380                         (pf == AF_INET) ? upd4statsindex : upd6statsindex;
2381                 break;
2382         case isc_sockettype_tcp:
2383                 sock->statsindex =
2384                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2385                 break;
2386         case isc_sockettype_unix:
2387                 sock->statsindex = unixstatsindex;
2388                 break;
2389         default:
2390                 INSIST(0);
2391         }
2392
2393         sock->pf = pf;
2394         result = opensocket(manager, sock);
2395         if (result != ISC_R_SUCCESS) {
2396                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2397                 free_socket(&sock);
2398                 return (result);
2399         }
2400
2401         sock->references = 1;
2402         *socketp = sock;
2403
2404         /*
2405          * Note we don't have to lock the socket like we normally would because
2406          * there are no external references to it yet.
2407          */
2408
2409         lockid = FDLOCK_ID(sock->fd);
2410         LOCK(&manager->fdlock[lockid]);
2411         manager->fds[sock->fd] = sock;
2412         manager->fdstate[sock->fd] = MANAGED;
2413 #ifdef USE_DEVPOLL
2414         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2415                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2416 #endif
2417         UNLOCK(&manager->fdlock[lockid]);
2418
2419         LOCK(&manager->lock);
2420         ISC_LIST_APPEND(manager->socklist, sock, link);
2421 #ifdef USE_SELECT
2422         if (manager->maxfd < sock->fd)
2423                 manager->maxfd = sock->fd;
2424 #endif
2425         UNLOCK(&manager->lock);
2426
2427         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2428                    ISC_MSG_CREATED, "created");
2429
2430         return (ISC_R_SUCCESS);
2431 }
2432
2433 isc_result_t
2434 isc_socket_open(isc_socket_t *sock) {
2435         isc_result_t result;
2436
2437         REQUIRE(VALID_SOCKET(sock));
2438
2439         LOCK(&sock->lock);
2440         REQUIRE(sock->references == 1);
2441         REQUIRE(sock->type != isc_sockettype_fdwatch);
2442         UNLOCK(&sock->lock);
2443         /*
2444          * We don't need to retain the lock hereafter, since no one else has
2445          * this socket.
2446          */
2447         REQUIRE(sock->fd == -1);
2448
2449         result = opensocket(sock->manager, sock);
2450         if (result != ISC_R_SUCCESS)
2451                 sock->fd = -1;
2452
2453         if (result == ISC_R_SUCCESS) {
2454                 int lockid = FDLOCK_ID(sock->fd);
2455
2456                 LOCK(&sock->manager->fdlock[lockid]);
2457                 sock->manager->fds[sock->fd] = sock;
2458                 sock->manager->fdstate[sock->fd] = MANAGED;
2459 #ifdef USE_DEVPOLL
2460                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2461                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2462 #endif
2463                 UNLOCK(&sock->manager->fdlock[lockid]);
2464
2465 #ifdef USE_SELECT
2466                 LOCK(&sock->manager->lock);
2467                 if (sock->manager->maxfd < sock->fd)
2468                         sock->manager->maxfd = sock->fd;
2469                 UNLOCK(&sock->manager->lock);
2470 #endif
2471         }
2472
2473         return (result);
2474 }
2475
2476 /*
2477  * Create a new 'type' socket managed by 'manager'.  Events
2478  * will be posted to 'task' and when dispatched 'action' will be
2479  * called with 'arg' as the arg value.  The new socket is returned
2480  * in 'socketp'.
2481  */
2482 isc_result_t
2483 isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
2484                          isc_sockfdwatch_t callback, void *cbarg,
2485                          isc_task_t *task, isc_socket_t **socketp)
2486 {
2487         isc_socket_t *sock = NULL;
2488         isc_result_t result;
2489         int lockid;
2490
2491         REQUIRE(VALID_MANAGER(manager));
2492         REQUIRE(socketp != NULL && *socketp == NULL);
2493
2494         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2495         if (result != ISC_R_SUCCESS)
2496                 return (result);
2497
2498         sock->fd = fd;
2499         sock->fdwatcharg = cbarg;
2500         sock->fdwatchcb = callback;
2501         sock->fdwatchflags = flags;
2502         sock->fdwatchtask = task;
2503         sock->statsindex = fdwatchstatsindex;
2504
2505         sock->references = 1;
2506         *socketp = sock;
2507
2508         /*
2509          * Note we don't have to lock the socket like we normally would because
2510          * there are no external references to it yet.
2511          */
2512
2513         lockid = FDLOCK_ID(sock->fd);
2514         LOCK(&manager->fdlock[lockid]);
2515         manager->fds[sock->fd] = sock;
2516         manager->fdstate[sock->fd] = MANAGED;
2517         UNLOCK(&manager->fdlock[lockid]);
2518
2519         LOCK(&manager->lock);
2520         ISC_LIST_APPEND(manager->socklist, sock, link);
2521 #ifdef USE_SELECT
2522         if (manager->maxfd < sock->fd)
2523                 manager->maxfd = sock->fd;
2524 #endif
2525         UNLOCK(&manager->lock);
2526
2527         if (flags & ISC_SOCKFDWATCH_READ)
2528                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2529         if (flags & ISC_SOCKFDWATCH_WRITE)
2530                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2531
2532         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2533                    ISC_MSG_CREATED, "fdwatch-created");
2534
2535         return (ISC_R_SUCCESS);
2536 }
2537
2538 /*
2539  * Attach to a socket.  Caller must explicitly detach when it is done.
2540  */
2541 void
2542 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2543         REQUIRE(VALID_SOCKET(sock));
2544         REQUIRE(socketp != NULL && *socketp == NULL);
2545
2546         LOCK(&sock->lock);
2547         sock->references++;
2548         UNLOCK(&sock->lock);
2549
2550         *socketp = sock;
2551 }
2552
2553 /*
2554  * Dereference a socket.  If this is the last reference to it, clean things
2555  * up by destroying the socket.
2556  */
2557 void
2558 isc_socket_detach(isc_socket_t **socketp) {
2559         isc_socket_t *sock;
2560         isc_boolean_t kill_socket = ISC_FALSE;
2561
2562         REQUIRE(socketp != NULL);
2563         sock = *socketp;
2564         REQUIRE(VALID_SOCKET(sock));
2565
2566         LOCK(&sock->lock);
2567         REQUIRE(sock->references > 0);
2568         sock->references--;
2569         if (sock->references == 0)
2570                 kill_socket = ISC_TRUE;
2571         UNLOCK(&sock->lock);
2572
2573         if (kill_socket)
2574                 destroy(&sock);
2575
2576         *socketp = NULL;
2577 }
2578
2579 isc_result_t
2580 isc_socket_close(isc_socket_t *sock) {
2581         int fd;
2582         isc_socketmgr_t *manager;
2583
2584         REQUIRE(VALID_SOCKET(sock));
2585
2586         LOCK(&sock->lock);
2587
2588         REQUIRE(sock->references == 1);
2589         REQUIRE(sock->type != isc_sockettype_fdwatch);
2590         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2591
2592         INSIST(!sock->connecting);
2593         INSIST(!sock->pending_recv);
2594         INSIST(!sock->pending_send);
2595         INSIST(!sock->pending_accept);
2596         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2597         INSIST(ISC_LIST_EMPTY(sock->send_list));
2598         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2599         INSIST(sock->connect_ev == NULL);
2600
2601         manager = sock->manager;
2602         fd = sock->fd;
2603         sock->fd = -1;
2604         memset(sock->name, 0, sizeof(sock->name));
2605         sock->tag = NULL;
2606         sock->listener = 0;
2607         sock->connected = 0;
2608         sock->connecting = 0;
2609         sock->bound = 0;
2610         isc_sockaddr_any(&sock->peer_address);
2611
2612         UNLOCK(&sock->lock);
2613
2614         closesocket(manager, sock, fd);
2615
2616         return (ISC_R_SUCCESS);
2617 }
2618
2619 /*
2620  * I/O is possible on a given socket.  Schedule an event to this task that
2621  * will call an internal function to do the I/O.  This will charge the
2622  * task with the I/O operation and let our select loop handler get back
2623  * to doing something real as fast as possible.
2624  *
2625  * The socket and manager must be locked before calling this function.
2626  */
2627 static void
2628 dispatch_recv(isc_socket_t *sock) {
2629         intev_t *iev;
2630         isc_socketevent_t *ev;
2631         isc_task_t *sender;
2632
2633         INSIST(!sock->pending_recv);
2634
2635         if (sock->type != isc_sockettype_fdwatch) {
2636                 ev = ISC_LIST_HEAD(sock->recv_list);
2637                 if (ev == NULL)
2638                         return;
2639                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2640                            "dispatch_recv:  event %p -> task %p",
2641                            ev, ev->ev_sender);
2642                 sender = ev->ev_sender;
2643         } else {
2644                 sender = sock->fdwatchtask;
2645         }
2646
2647         sock->pending_recv = 1;
2648         iev = &sock->readable_ev;
2649
2650         sock->references++;
2651         iev->ev_sender = sock;
2652         if (sock->type == isc_sockettype_fdwatch)
2653                 iev->ev_action = internal_fdwatch_read;
2654         else
2655                 iev->ev_action = internal_recv;
2656         iev->ev_arg = sock;
2657
2658         isc_task_send(sender, (isc_event_t **)&iev);
2659 }
2660
2661 static void
2662 dispatch_send(isc_socket_t *sock) {
2663         intev_t *iev;
2664         isc_socketevent_t *ev;
2665         isc_task_t *sender;
2666
2667         INSIST(!sock->pending_send);
2668
2669         if (sock->type != isc_sockettype_fdwatch) {
2670                 ev = ISC_LIST_HEAD(sock->send_list);
2671                 if (ev == NULL)
2672                         return;
2673                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2674                            "dispatch_send:  event %p -> task %p",
2675                            ev, ev->ev_sender);
2676                 sender = ev->ev_sender;
2677         } else {
2678                 sender = sock->fdwatchtask;
2679         }
2680
2681         sock->pending_send = 1;
2682         iev = &sock->writable_ev;
2683
2684         sock->references++;
2685         iev->ev_sender = sock;
2686         if (sock->type == isc_sockettype_fdwatch)
2687                 iev->ev_action = internal_fdwatch_write;
2688         else
2689                 iev->ev_action = internal_send;
2690         iev->ev_arg = sock;
2691
2692         isc_task_send(sender, (isc_event_t **)&iev);
2693 }
2694
2695 /*
2696  * Dispatch an internal accept event.
2697  */
2698 static void
2699 dispatch_accept(isc_socket_t *sock) {
2700         intev_t *iev;
2701         isc_socket_newconnev_t *ev;
2702
2703         INSIST(!sock->pending_accept);
2704
2705         /*
2706          * Are there any done events left, or were they all canceled
2707          * before the manager got the socket lock?
2708          */
2709         ev = ISC_LIST_HEAD(sock->accept_list);
2710         if (ev == NULL)
2711                 return;
2712
2713         sock->pending_accept = 1;
2714         iev = &sock->readable_ev;
2715
2716         sock->references++;  /* keep socket around for this internal event */
2717         iev->ev_sender = sock;
2718         iev->ev_action = internal_accept;
2719         iev->ev_arg = sock;
2720
2721         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2722 }
2723
2724 static void
2725 dispatch_connect(isc_socket_t *sock) {
2726         intev_t *iev;
2727         isc_socket_connev_t *ev;
2728
2729         iev = &sock->writable_ev;
2730
2731         ev = sock->connect_ev;
2732         INSIST(ev != NULL); /* XXX */
2733
2734         INSIST(sock->connecting);
2735
2736         sock->references++;  /* keep socket around for this internal event */
2737         iev->ev_sender = sock;
2738         iev->ev_action = internal_connect;
2739         iev->ev_arg = sock;
2740
2741         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2742 }
2743
2744 /*
2745  * Dequeue an item off the given socket's read queue, set the result code
2746  * in the done event to the one provided, and send it to the task it was
2747  * destined for.
2748  *
2749  * If the event to be sent is on a list, remove it before sending.  If
2750  * asked to, send and detach from the socket as well.
2751  *
2752  * Caller must have the socket locked if the event is attached to the socket.
2753  */
2754 static void
2755 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2756         isc_task_t *task;
2757
2758         task = (*dev)->ev_sender;
2759
2760         (*dev)->ev_sender = sock;
2761
2762         if (ISC_LINK_LINKED(*dev, ev_link))
2763                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2764
2765         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2766             == ISC_SOCKEVENTATTR_ATTACHED)
2767                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2768         else
2769                 isc_task_send(task, (isc_event_t **)dev);
2770 }
2771
2772 /*
2773  * See comments for send_recvdone_event() above.
2774  *
2775  * Caller must have the socket locked if the event is attached to the socket.
2776  */
2777 static void
2778 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2779         isc_task_t *task;
2780
2781         INSIST(dev != NULL && *dev != NULL);
2782
2783         task = (*dev)->ev_sender;
2784         (*dev)->ev_sender = sock;
2785
2786         if (ISC_LINK_LINKED(*dev, ev_link))
2787                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2788
2789         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2790             == ISC_SOCKEVENTATTR_ATTACHED)
2791                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2792         else
2793                 isc_task_send(task, (isc_event_t **)dev);
2794 }
2795
2796 /*
2797  * Call accept() on a socket, to get the new file descriptor.  The listen
2798  * socket is used as a prototype to create a new isc_socket_t.  The new
2799  * socket has one outstanding reference.  The task receiving the event
2800  * will be detached from just after the event is delivered.
2801  *
2802  * On entry to this function, the event delivered is the internal
2803  * readable event, and the first item on the accept_list should be
2804  * the done event we want to send.  If the list is empty, this is a no-op,
2805  * so just unlock and return.
2806  */
2807 static void
2808 internal_accept(isc_task_t *me, isc_event_t *ev) {
2809         isc_socket_t *sock;
2810         isc_socketmgr_t *manager;
2811         isc_socket_newconnev_t *dev;
2812         isc_task_t *task;
2813         ISC_SOCKADDR_LEN_T addrlen;
2814         int fd;
2815         isc_result_t result = ISC_R_SUCCESS;
2816         char strbuf[ISC_STRERRORSIZE];
2817         const char *err = "accept";
2818
2819         UNUSED(me);
2820
2821         sock = ev->ev_sender;
2822         INSIST(VALID_SOCKET(sock));
2823
2824         LOCK(&sock->lock);
2825         socket_log(sock, NULL, TRACE,
2826                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2827                    "internal_accept called, locked socket");
2828
2829         manager = sock->manager;
2830         INSIST(VALID_MANAGER(manager));
2831
2832         INSIST(sock->listener);
2833         INSIST(sock->pending_accept == 1);
2834         sock->pending_accept = 0;
2835
2836         INSIST(sock->references > 0);
2837         sock->references--;  /* the internal event is done with this socket */
2838         if (sock->references == 0) {
2839                 UNLOCK(&sock->lock);
2840                 destroy(&sock);
2841                 return;
2842         }
2843
2844         /*
2845          * Get the first item off the accept list.
2846          * If it is empty, unlock the socket and return.
2847          */
2848         dev = ISC_LIST_HEAD(sock->accept_list);
2849         if (dev == NULL) {
2850                 UNLOCK(&sock->lock);
2851                 return;
2852         }
2853
2854         /*
2855          * Try to accept the new connection.  If the accept fails with
2856          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2857          * again.  Also ignore ECONNRESET, which has been reported to
2858          * be spuriously returned on Linux 2.2.19 although it is not
2859          * a documented error for accept().  ECONNABORTED has been
2860          * reported for Solaris 8.  The rest are thrown in not because
2861          * we have seen them but because they are ignored by other
2862          * daemons such as BIND 8 and Apache.
2863          */
2864
2865         addrlen = sizeof(dev->newsocket->peer_address.type);
2866         memset(&dev->newsocket->peer_address.type, 0, addrlen);
2867         fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa,
2868                     (void *)&addrlen);
2869
2870 #ifdef F_DUPFD
2871         /*
2872          * Leave a space for stdio to work in.
2873          */
2874         if (fd >= 0 && fd < 20) {
2875                 int new, tmp;
2876                 new = fcntl(fd, F_DUPFD, 20);
2877                 tmp = errno;
2878                 (void)close(fd);
2879                 errno = tmp;
2880                 fd = new;
2881                 err = "accept/fcntl";
2882         }
2883 #endif
2884
2885         if (fd < 0) {
2886                 if (SOFT_ERROR(errno))
2887                         goto soft_error;
2888                 switch (errno) {
2889                 case ENFILE:
2890                 case EMFILE:
2891                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2892                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2893                                        isc_msgcat, ISC_MSGSET_SOCKET,
2894                                        ISC_MSG_TOOMANYFDS,
2895                                        "%s: too many open file descriptors",
2896                                        err);
2897                         goto soft_error;
2898
2899                 case ENOBUFS:
2900                 case ENOMEM:
2901                 case ECONNRESET:
2902                 case ECONNABORTED:
2903                 case EHOSTUNREACH:
2904                 case EHOSTDOWN:
2905                 case ENETUNREACH:
2906                 case ENETDOWN:
2907                 case ECONNREFUSED:
2908 #ifdef EPROTO
2909                 case EPROTO:
2910 #endif
2911 #ifdef ENONET
2912                 case ENONET:
2913 #endif
2914                         goto soft_error;
2915                 default:
2916                         break;
2917                 }
2918                 isc__strerror(errno, strbuf, sizeof(strbuf));
2919                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2920                                  "internal_accept: %s() %s: %s", err,
2921                                  isc_msgcat_get(isc_msgcat,
2922                                                 ISC_MSGSET_GENERAL,
2923                                                 ISC_MSG_FAILED,
2924                                                 "failed"),
2925                                  strbuf);
2926                 fd = -1;
2927                 result = ISC_R_UNEXPECTED;
2928         } else {
2929                 if (addrlen == 0U) {
2930                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2931                                          "internal_accept(): "
2932                                          "accept() failed to return "
2933                                          "remote address");
2934
2935                         (void)close(fd);
2936                         goto soft_error;
2937                 } else if (dev->newsocket->peer_address.type.sa.sa_family !=
2938                            sock->pf)
2939                 {
2940                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2941                                          "internal_accept(): "
2942                                          "accept() returned peer address "
2943                                          "family %u (expected %u)",
2944                                          dev->newsocket->peer_address.
2945                                          type.sa.sa_family,
2946                                          sock->pf);
2947                         (void)close(fd);
2948                         goto soft_error;
2949                 } else if (fd >= (int)manager->maxsocks) {
2950                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2951                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2952                                        isc_msgcat, ISC_MSGSET_SOCKET,
2953                                        ISC_MSG_TOOMANYFDS,
2954                                        "accept: "
2955                                        "file descriptor exceeds limit (%d/%u)",
2956                                        fd, manager->maxsocks);
2957                         (void)close(fd);
2958                         goto soft_error;
2959                 }
2960         }
2961
2962         if (fd != -1) {
2963                 dev->newsocket->peer_address.length = addrlen;
2964                 dev->newsocket->pf = sock->pf;
2965         }
2966
2967         /*
2968          * Pull off the done event.
2969          */
2970         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2971
2972         /*
2973          * Poke watcher if there are more pending accepts.
2974          */
2975         if (!ISC_LIST_EMPTY(sock->accept_list))
2976                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2977
2978         UNLOCK(&sock->lock);
2979
2980         if (fd != -1) {
2981                 result = make_nonblock(fd);
2982                 if (result != ISC_R_SUCCESS) {
2983                         (void)close(fd);
2984                         fd = -1;
2985                 }
2986         }
2987
2988         /*
2989          * -1 means the new socket didn't happen.
2990          */
2991         if (fd != -1) {
2992                 int lockid = FDLOCK_ID(fd);
2993
2994                 LOCK(&manager->fdlock[lockid]);
2995                 manager->fds[fd] = dev->newsocket;
2996                 manager->fdstate[fd] = MANAGED;
2997                 UNLOCK(&manager->fdlock[lockid]);
2998
2999                 LOCK(&manager->lock);
3000                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
3001
3002                 dev->newsocket->fd = fd;
3003                 dev->newsocket->bound = 1;
3004                 dev->newsocket->connected = 1;
3005
3006                 /*
3007                  * Save away the remote address
3008                  */
3009                 dev->address = dev->newsocket->peer_address;
3010
3011 #ifdef USE_SELECT
3012                 if (manager->maxfd < fd)
3013                         manager->maxfd = fd;
3014 #endif
3015
3016                 socket_log(sock, &dev->newsocket->peer_address, CREATION,
3017                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
3018                            "accepted connection, new socket %p",
3019                            dev->newsocket);
3020
3021                 UNLOCK(&manager->lock);
3022
3023                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3024         } else {
3025                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3026                 dev->newsocket->references--;
3027                 free_socket(&dev->newsocket);
3028         }
3029
3030         /*
3031          * Fill in the done event details and send it off.
3032          */
3033         dev->result = result;
3034         task = dev->ev_sender;
3035         dev->ev_sender = sock;
3036
3037         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3038         return;
3039
3040  soft_error:
3041         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3042         UNLOCK(&sock->lock);
3043
3044         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3045         return;
3046 }
3047
3048 static void
3049 internal_recv(isc_task_t *me, isc_event_t *ev) {
3050         isc_socketevent_t *dev;
3051         isc_socket_t *sock;
3052
3053         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3054
3055         sock = ev->ev_sender;
3056         INSIST(VALID_SOCKET(sock));
3057
3058         LOCK(&sock->lock);
3059         socket_log(sock, NULL, IOEVENT,
3060                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3061                    "internal_recv: task %p got event %p", me, ev);
3062
3063         INSIST(sock->pending_recv == 1);
3064         sock->pending_recv = 0;
3065
3066         INSIST(sock->references > 0);
3067         sock->references--;  /* the internal event is done with this socket */
3068         if (sock->references == 0) {
3069                 UNLOCK(&sock->lock);
3070                 destroy(&sock);
3071                 return;
3072         }
3073
3074         /*
3075          * Try to do as much I/O as possible on this socket.  There are no
3076          * limits here, currently.
3077          */
3078         dev = ISC_LIST_HEAD(sock->recv_list);
3079         while (dev != NULL) {
3080                 switch (doio_recv(sock, dev)) {
3081                 case DOIO_SOFT:
3082                         goto poke;
3083
3084                 case DOIO_EOF:
3085                         /*
3086                          * read of 0 means the remote end was closed.
3087                          * Run through the event queue and dispatch all
3088                          * the events with an EOF result code.
3089                          */
3090                         do {
3091                                 dev->result = ISC_R_EOF;
3092                                 send_recvdone_event(sock, &dev);
3093                                 dev = ISC_LIST_HEAD(sock->recv_list);
3094                         } while (dev != NULL);
3095                         goto poke;
3096
3097                 case DOIO_SUCCESS:
3098                 case DOIO_HARD:
3099                         send_recvdone_event(sock, &dev);
3100                         break;
3101                 }
3102
3103                 dev = ISC_LIST_HEAD(sock->recv_list);
3104         }
3105
3106  poke:
3107         if (!ISC_LIST_EMPTY(sock->recv_list))
3108                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3109
3110         UNLOCK(&sock->lock);
3111 }
3112
3113 static void
3114 internal_send(isc_task_t *me, isc_event_t *ev) {
3115         isc_socketevent_t *dev;
3116         isc_socket_t *sock;
3117
3118         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3119
3120         /*
3121          * Find out what socket this is and lock it.
3122          */
3123         sock = (isc_socket_t *)ev->ev_sender;
3124         INSIST(VALID_SOCKET(sock));
3125
3126         LOCK(&sock->lock);
3127         socket_log(sock, NULL, IOEVENT,
3128                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3129                    "internal_send: task %p got event %p", me, ev);
3130
3131         INSIST(sock->pending_send == 1);
3132         sock->pending_send = 0;
3133
3134         INSIST(sock->references > 0);
3135         sock->references--;  /* the internal event is done with this socket */
3136         if (sock->references == 0) {
3137                 UNLOCK(&sock->lock);
3138                 destroy(&sock);
3139                 return;
3140         }
3141
3142         /*
3143          * Try to do as much I/O as possible on this socket.  There are no
3144          * limits here, currently.
3145          */
3146         dev = ISC_LIST_HEAD(sock->send_list);
3147         while (dev != NULL) {
3148                 switch (doio_send(sock, dev)) {
3149                 case DOIO_SOFT:
3150                         goto poke;
3151
3152                 case DOIO_HARD:
3153                 case DOIO_SUCCESS:
3154                         send_senddone_event(sock, &dev);
3155                         break;
3156                 }
3157
3158                 dev = ISC_LIST_HEAD(sock->send_list);
3159         }
3160
3161  poke:
3162         if (!ISC_LIST_EMPTY(sock->send_list))
3163                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3164
3165         UNLOCK(&sock->lock);
3166 }
3167
3168 static void
3169 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3170         isc_socket_t *sock;
3171         int more_data;
3172
3173         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3174
3175         /*
3176          * Find out what socket this is and lock it.
3177          */
3178         sock = (isc_socket_t *)ev->ev_sender;
3179         INSIST(VALID_SOCKET(sock));
3180
3181         LOCK(&sock->lock);
3182         socket_log(sock, NULL, IOEVENT,
3183                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3184                    "internal_fdwatch_write: task %p got event %p", me, ev);
3185
3186         INSIST(sock->pending_send == 1);
3187
3188         UNLOCK(&sock->lock);
3189         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3190         LOCK(&sock->lock);
3191
3192         sock->pending_send = 0;
3193
3194         INSIST(sock->references > 0);
3195         sock->references--;  /* the internal event is done with this socket */
3196         if (sock->references == 0) {
3197                 UNLOCK(&sock->lock);
3198                 destroy(&sock);
3199                 return;
3200         }
3201
3202         if (more_data)
3203                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3204
3205         UNLOCK(&sock->lock);
3206 }
3207
3208 static void
3209 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3210         isc_socket_t *sock;
3211         int more_data;
3212
3213         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3214
3215         /*
3216          * Find out what socket this is and lock it.
3217          */
3218         sock = (isc_socket_t *)ev->ev_sender;
3219         INSIST(VALID_SOCKET(sock));
3220
3221         LOCK(&sock->lock);
3222         socket_log(sock, NULL, IOEVENT,
3223                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3224                    "internal_fdwatch_read: task %p got event %p", me, ev);
3225
3226         INSIST(sock->pending_recv == 1);
3227
3228         UNLOCK(&sock->lock);
3229         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3230         LOCK(&sock->lock);
3231
3232         sock->pending_recv = 0;
3233
3234         INSIST(sock->references > 0);
3235         sock->references--;  /* the internal event is done with this socket */
3236         if (sock->references == 0) {
3237                 UNLOCK(&sock->lock);
3238                 destroy(&sock);
3239                 return;
3240         }
3241
3242         if (more_data)
3243                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3244
3245         UNLOCK(&sock->lock);
3246 }
3247
3248 /*
3249  * Process read/writes on each fd here.  Avoid locking
3250  * and unlocking twice if both reads and writes are possible.
3251  */
3252 static void
3253 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
3254            isc_boolean_t writeable)
3255 {
3256         isc_socket_t *sock;
3257         isc_boolean_t unlock_sock;
3258         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3259         int lockid = FDLOCK_ID(fd);
3260
3261         /*
3262          * If the socket is going to be closed, don't do more I/O.
3263          */
3264         LOCK(&manager->fdlock[lockid]);
3265         if (manager->fdstate[fd] == CLOSE_PENDING) {
3266                 UNLOCK(&manager->fdlock[lockid]);
3267
3268                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3269                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3270                 return;
3271         }
3272
3273         sock = manager->fds[fd];
3274         unlock_sock = ISC_FALSE;
3275         if (readable) {
3276                 if (sock == NULL) {
3277                         unwatch_read = ISC_TRUE;
3278                         goto check_write;
3279                 }
3280                 unlock_sock = ISC_TRUE;
3281                 LOCK(&sock->lock);
3282                 if (!SOCK_DEAD(sock)) {
3283                         if (sock->listener)
3284                                 dispatch_accept(sock);
3285                         else
3286                                 dispatch_recv(sock);
3287                 }
3288                 unwatch_read = ISC_TRUE;
3289         }
3290 check_write:
3291         if (writeable) {
3292                 if (sock == NULL) {
3293                         unwatch_write = ISC_TRUE;
3294                         goto unlock_fd;
3295                 }
3296                 if (!unlock_sock) {
3297                         unlock_sock = ISC_TRUE;
3298                         LOCK(&sock->lock);
3299                 }
3300                 if (!SOCK_DEAD(sock)) {
3301                         if (sock->connecting)
3302                                 dispatch_connect(sock);
3303                         else
3304                                 dispatch_send(sock);
3305                 }
3306                 unwatch_write = ISC_TRUE;
3307         }
3308         if (unlock_sock)
3309                 UNLOCK(&sock->lock);
3310
3311  unlock_fd:
3312         UNLOCK(&manager->fdlock[lockid]);
3313         if (unwatch_read)
3314                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3315         if (unwatch_write)
3316                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3317
3318 }
3319
3320 #ifdef USE_KQUEUE
3321 static isc_boolean_t
3322 process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
3323         int i;
3324         isc_boolean_t readable, writable;
3325         isc_boolean_t done = ISC_FALSE;
3326 #ifdef ISC_PLATFORM_USETHREADS
3327         isc_boolean_t have_ctlevent = ISC_FALSE;
3328 #endif
3329
3330         if (nevents == manager->nevents) {
3331                 /*
3332                  * This is not an error, but something unexpected.  If this
3333                  * happens, it may indicate the need for increasing
3334                  * ISC_SOCKET_MAXEVENTS.
3335                  */
3336                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3337                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3338                             "maximum number of FD events (%d) received",
3339                             nevents);
3340         }
3341
3342         for (i = 0; i < nevents; i++) {
3343                 REQUIRE(events[i].ident < manager->maxsocks);
3344 #ifdef ISC_PLATFORM_USETHREADS
3345                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3346                         have_ctlevent = ISC_TRUE;
3347                         continue;
3348                 }
3349 #endif
3350                 readable = ISC_TF(events[i].filter == EVFILT_READ);
3351                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3352                 process_fd(manager, events[i].ident, readable, writable);
3353         }
3354
3355 #ifdef ISC_PLATFORM_USETHREADS
3356         if (have_ctlevent)
3357                 done = process_ctlfd(manager);
3358 #endif
3359
3360         return (done);
3361 }
3362 #elif defined(USE_EPOLL)
3363 static isc_boolean_t
3364 process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
3365         int i;
3366         isc_boolean_t done = ISC_FALSE;
3367 #ifdef ISC_PLATFORM_USETHREADS
3368         isc_boolean_t have_ctlevent = ISC_FALSE;
3369 #endif
3370
3371         if (nevents == manager->nevents) {
3372                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3373                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3374                             "maximum number of FD events (%d) received",
3375                             nevents);
3376         }
3377
3378         for (i = 0; i < nevents; i++) {
3379                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3380 #ifdef ISC_PLATFORM_USETHREADS
3381                 if (events[i].data.fd == manager->pipe_fds[0]) {
3382                         have_ctlevent = ISC_TRUE;
3383                         continue;
3384                 }
3385 #endif
3386                 if ((events[i].events & EPOLLERR) != 0 ||
3387                     (events[i].events & EPOLLHUP) != 0) {
3388                         /*
3389                          * epoll does not set IN/OUT bits on an erroneous
3390                          * condition, so we need to try both anyway.  This is a
3391                          * bit inefficient, but should be okay for such rare
3392                          * events.  Note also that the read or write attempt
3393                          * won't block because we use non-blocking sockets.
3394                          */
3395                         events[i].events |= (EPOLLIN | EPOLLOUT);
3396                 }
3397                 process_fd(manager, events[i].data.fd,
3398                            (events[i].events & EPOLLIN) != 0,
3399                            (events[i].events & EPOLLOUT) != 0);
3400         }
3401
3402 #ifdef ISC_PLATFORM_USETHREADS
3403         if (have_ctlevent)
3404                 done = process_ctlfd(manager);
3405 #endif
3406
3407         return (done);
3408 }
3409 #elif defined(USE_DEVPOLL)
3410 static isc_boolean_t
3411 process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
3412         int i;
3413         isc_boolean_t done = ISC_FALSE;
3414 #ifdef ISC_PLATFORM_USETHREADS
3415         isc_boolean_t have_ctlevent = ISC_FALSE;
3416 #endif
3417
3418         if (nevents == manager->nevents) {
3419                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3420                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3421                             "maximum number of FD events (%d) received",
3422                             nevents);
3423         }
3424
3425         for (i = 0; i < nevents; i++) {
3426                 REQUIRE(events[i].fd < (int)manager->maxsocks);
3427 #ifdef ISC_PLATFORM_USETHREADS
3428                 if (events[i].fd == manager->pipe_fds[0]) {
3429                         have_ctlevent = ISC_TRUE;
3430                         continue;
3431                 }
3432 #endif
3433                 process_fd(manager, events[i].fd,
3434                            (events[i].events & POLLIN) != 0,
3435                            (events[i].events & POLLOUT) != 0);
3436         }
3437
3438 #ifdef ISC_PLATFORM_USETHREADS
3439         if (have_ctlevent)
3440                 done = process_ctlfd(manager);
3441 #endif
3442
3443         return (done);
3444 }
3445 #elif defined(USE_SELECT)
3446 static void
3447 process_fds(isc_socketmgr_t *manager, int maxfd,
3448             fd_set *readfds, fd_set *writefds)
3449 {
3450         int i;
3451
3452         REQUIRE(maxfd <= (int)manager->maxsocks);
3453
3454         for (i = 0; i < maxfd; i++) {
3455 #ifdef ISC_PLATFORM_USETHREADS
3456                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3457                         continue;
3458 #endif /* ISC_PLATFORM_USETHREADS */
3459                 process_fd(manager, i, FD_ISSET(i, readfds),
3460                            FD_ISSET(i, writefds));
3461         }
3462 }
3463 #endif
3464
3465 #ifdef ISC_PLATFORM_USETHREADS
3466 static isc_boolean_t
3467 process_ctlfd(isc_socketmgr_t *manager) {
3468         int msg, fd;
3469
3470         for (;;) {
3471                 select_readmsg(manager, &fd, &msg);
3472
3473                 manager_log(manager, IOEVENT,
3474                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3475                                            ISC_MSG_WATCHERMSG,
3476                                            "watcher got message %d "
3477                                            "for socket %d"), msg, fd);
3478
3479                 /*
3480                  * Nothing to read?
3481                  */
3482                 if (msg == SELECT_POKE_NOTHING)
3483                         break;
3484
3485                 /*
3486                  * Handle shutdown message.  We really should
3487                  * jump out of this loop right away, but
3488                  * it doesn't matter if we have to do a little
3489                  * more work first.
3490                  */
3491                 if (msg == SELECT_POKE_SHUTDOWN)
3492                         return (ISC_TRUE);
3493
3494                 /*
3495                  * This is a wakeup on a socket.  Look
3496                  * at the event queue for both read and write,
3497                  * and decide if we need to watch on it now
3498                  * or not.
3499                  */
3500                 wakeup_socket(manager, fd, msg);
3501         }
3502
3503         return (ISC_FALSE);
3504 }
3505
3506 /*
3507  * This is the thread that will loop forever, always in a select or poll
3508  * call.
3509  *
3510  * When select returns something to do, track down what thread gets to do
3511  * this I/O and post the event to it.
3512  */
3513 static isc_threadresult_t
3514 watcher(void *uap) {
3515         isc_socketmgr_t *manager = uap;
3516         isc_boolean_t done;
3517         int cc;
3518 #ifdef USE_KQUEUE
3519         const char *fnname = "kevent()";
3520 #elif defined (USE_EPOLL)
3521         const char *fnname = "epoll_wait()";
3522 #elif defined(USE_DEVPOLL)
3523         const char *fnname = "ioctl(DP_POLL)";
3524         struct dvpoll dvp;
3525 #elif defined (USE_SELECT)
3526         const char *fnname = "select()";
3527         int maxfd;
3528         int ctlfd;
3529 #endif
3530         char strbuf[ISC_STRERRORSIZE];
3531 #ifdef ISC_SOCKET_USE_POLLWATCH
3532         pollstate_t pollstate = poll_idle;
3533 #endif
3534
3535 #if defined (USE_SELECT)
3536         /*
3537          * Get the control fd here.  This will never change.
3538          */
3539         ctlfd = manager->pipe_fds[0];
3540 #endif
3541         done = ISC_FALSE;
3542         while (!done) {
3543                 do {
3544 #ifdef USE_KQUEUE
3545                         cc = kevent(manager->kqueue_fd, NULL, 0,
3546                                     manager->events, manager->nevents, NULL);
3547 #elif defined(USE_EPOLL)
3548                         cc = epoll_wait(manager->epoll_fd, manager->events,
3549                                         manager->nevents, -1);
3550 #elif defined(USE_DEVPOLL)
3551                         dvp.dp_fds = manager->events;
3552                         dvp.dp_nfds = manager->nevents;
3553 #ifndef ISC_SOCKET_USE_POLLWATCH
3554                         dvp.dp_timeout = -1;
3555 #else
3556                         if (pollstate == poll_idle)
3557                                 dvp.dp_timeout = -1;
3558                         else
3559                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3560 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3561                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3562 #elif defined(USE_SELECT)
3563                         LOCK(&manager->lock);
3564                         memcpy(manager->read_fds_copy, manager->read_fds,
3565                                manager->fd_bufsize);
3566                         memcpy(manager->write_fds_copy, manager->write_fds,
3567                                manager->fd_bufsize);
3568                         maxfd = manager->maxfd + 1;
3569                         UNLOCK(&manager->lock);
3570
3571                         cc = select(maxfd, manager->read_fds_copy,
3572                                     manager->write_fds_copy, NULL, NULL);
3573 #endif  /* USE_KQUEUE */
3574
3575                         if (cc < 0 && !SOFT_ERROR(errno)) {
3576                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3577                                 FATAL_ERROR(__FILE__, __LINE__,
3578                                             "%s %s: %s", fnname,
3579                                             isc_msgcat_get(isc_msgcat,
3580                                                            ISC_MSGSET_GENERAL,
3581                                                            ISC_MSG_FAILED,
3582                                                            "failed"), strbuf);
3583                         }
3584
3585 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3586                         if (cc == 0) {
3587                                 if (pollstate == poll_active)
3588                                         pollstate = poll_checking;
3589                                 else if (pollstate == poll_checking)
3590                                         pollstate = poll_idle;
3591                         } else if (cc > 0) {
3592                                 if (pollstate == poll_checking) {
3593                                         /*
3594                                          * XXX: We'd like to use a more
3595                                          * verbose log level as it's actually an
3596                                          * unexpected event, but the kernel bug
3597                                          * reportedly happens pretty frequently
3598                                          * (and it can also be a false positive)
3599                                          * so it would be just too noisy.
3600                                          */
3601                                         manager_log(manager,
3602                                                     ISC_LOGCATEGORY_GENERAL,
3603                                                     ISC_LOGMODULE_SOCKET,
3604                                                     ISC_LOG_DEBUG(1),
3605                                                     "unexpected POLL timeout");
3606                                 }
3607                                 pollstate = poll_active;
3608                         }
3609 #endif
3610                 } while (cc < 0);
3611
3612 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3613                 done = process_fds(manager, manager->events, cc);
3614 #elif defined(USE_SELECT)
3615                 process_fds(manager, maxfd, manager->read_fds_copy,
3616                             manager->write_fds_copy);
3617
3618                 /*
3619                  * Process reads on internal, control fd.
3620                  */
3621                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3622                         done = process_ctlfd(manager);
3623 #endif
3624         }
3625
3626         manager_log(manager, TRACE, "%s",
3627                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3628                                    ISC_MSG_EXITING, "watcher exiting"));
3629
3630         return ((isc_threadresult_t)0);
3631 }
3632 #endif /* ISC_PLATFORM_USETHREADS */
3633
3634 void
3635 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3636
3637         REQUIRE(VALID_MANAGER(manager));
3638
3639         manager->reserved = reserved;
3640 }
3641
3642 /*
3643  * Create a new socket manager.
3644  */
3645
3646 static isc_result_t
3647 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3648         isc_result_t result;
3649 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3650         char strbuf[ISC_STRERRORSIZE];
3651 #endif
3652
3653 #ifdef USE_KQUEUE
3654         manager->nevents = ISC_SOCKET_MAXEVENTS;
3655         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3656                                       manager->nevents);
3657         if (manager->events == NULL)
3658                 return (ISC_R_NOMEMORY);
3659         manager->kqueue_fd = kqueue();
3660         if (manager->kqueue_fd == -1) {
3661                 result = isc__errno2result(errno);
3662                 isc__strerror(errno, strbuf, sizeof(strbuf));
3663                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3664                                  "kqueue %s: %s",
3665                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3666                                                 ISC_MSG_FAILED, "failed"),
3667                                  strbuf);
3668                 isc_mem_put(mctx, manager->events,
3669                             sizeof(struct kevent) * manager->nevents);
3670                 return (result);
3671         }
3672
3673 #ifdef ISC_PLATFORM_USETHREADS
3674         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3675         if (result != ISC_R_SUCCESS) {
3676                 close(manager->kqueue_fd);
3677                 isc_mem_put(mctx, manager->events,
3678                             sizeof(struct kevent) * manager->nevents);
3679                 return (result);
3680         }
3681 #endif  /* ISC_PLATFORM_USETHREADS */
3682 #elif defined(USE_EPOLL)
3683         manager->nevents = ISC_SOCKET_MAXEVENTS;
3684         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3685                                       manager->nevents);
3686         if (manager->events == NULL)
3687                 return (ISC_R_NOMEMORY);
3688         manager->epoll_fd = epoll_create(manager->nevents);
3689         if (manager->epoll_fd == -1) {
3690                 result = isc__errno2result(errno);
3691                 isc__strerror(errno, strbuf, sizeof(strbuf));
3692                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3693                                  "epoll_create %s: %s",
3694                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3695                                                 ISC_MSG_FAILED, "failed"),
3696                                  strbuf);
3697                 isc_mem_put(mctx, manager->events,
3698                             sizeof(struct epoll_event) * manager->nevents);
3699                 return (result);
3700         }
3701 #ifdef ISC_PLATFORM_USETHREADS
3702         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3703         if (result != ISC_R_SUCCESS) {
3704                 close(manager->epoll_fd);
3705                 isc_mem_put(mctx, manager->events,
3706                             sizeof(struct epoll_event) * manager->nevents);
3707                 return (result);
3708         }
3709 #endif  /* ISC_PLATFORM_USETHREADS */
3710 #elif defined(USE_DEVPOLL)
3711         /*
3712          * XXXJT: /dev/poll seems to reject large numbers of events,
3713          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3714          */
3715         manager->nevents = ISC_SOCKET_MAXEVENTS;
3716         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3717                                       manager->nevents);
3718         if (manager->events == NULL)
3719                 return (ISC_R_NOMEMORY);
3720         /*
3721          * Note: fdpollinfo should be able to support all possible FDs, so
3722          * it must have maxsocks entries (not nevents).
3723          */
3724         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3725                                           manager->maxsocks);
3726         if (manager->fdpollinfo == NULL) {
3727                 isc_mem_put(mctx, manager->events,
3728                             sizeof(struct pollfd) * manager->nevents);
3729                 return (ISC_R_NOMEMORY);
3730         }
3731         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3732         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3733         if (manager->devpoll_fd == -1) {
3734                 result = isc__errno2result(errno);
3735                 isc__strerror(errno, strbuf, sizeof(strbuf));
3736                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3737                                  "open(/dev/poll) %s: %s",
3738                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3739                                                 ISC_MSG_FAILED, "failed"),
3740                                  strbuf);
3741                 isc_mem_put(mctx, manager->events,
3742                             sizeof(struct pollfd) * manager->nevents);
3743                 isc_mem_put(mctx, manager->fdpollinfo,
3744                             sizeof(pollinfo_t) * manager->maxsocks);
3745                 return (result);
3746         }
3747 #ifdef ISC_PLATFORM_USETHREADS
3748         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3749         if (result != ISC_R_SUCCESS) {
3750                 close(manager->devpoll_fd);
3751                 isc_mem_put(mctx, manager->events,
3752                             sizeof(struct pollfd) * manager->nevents);
3753                 isc_mem_put(mctx, manager->fdpollinfo,
3754                             sizeof(pollinfo_t) * manager->maxsocks);
3755                 return (result);
3756         }
3757 #endif  /* ISC_PLATFORM_USETHREADS */
3758 #elif defined(USE_SELECT)
3759         UNUSED(result);
3760
3761 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3762         /*
3763          * Note: this code should also cover the case of MAXSOCKETS <=
3764          * FD_SETSIZE, but we separate the cases to avoid possible portability
3765          * issues regarding howmany() and the actual representation of fd_set.
3766          */
3767         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3768                 sizeof(fd_mask);
3769 #else
3770         manager->fd_bufsize = sizeof(fd_set);
3771 #endif
3772
3773         manager->read_fds = NULL;
3774         manager->read_fds_copy = NULL;
3775         manager->write_fds = NULL;
3776         manager->write_fds_copy = NULL;
3777
3778         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3779         if (manager->read_fds != NULL)
3780                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3781         if (manager->read_fds_copy != NULL)
3782                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3783         if (manager->write_fds != NULL) {
3784                 manager->write_fds_copy = isc_mem_get(mctx,
3785                                                       manager->fd_bufsize);
3786         }
3787         if (manager->write_fds_copy == NULL) {
3788                 if (manager->write_fds != NULL) {
3789                         isc_mem_put(mctx, manager->write_fds,
3790                                     manager->fd_bufsize);
3791                 }
3792                 if (manager->read_fds_copy != NULL) {
3793                         isc_mem_put(mctx, manager->read_fds_copy,
3794                                     manager->fd_bufsize);
3795                 }
3796                 if (manager->read_fds != NULL) {
3797                         isc_mem_put(mctx, manager->read_fds,
3798                                     manager->fd_bufsize);
3799                 }
3800                 return (ISC_R_NOMEMORY);
3801         }
3802         memset(manager->read_fds, 0, manager->fd_bufsize);
3803         memset(manager->write_fds, 0, manager->fd_bufsize);
3804
3805 #ifdef ISC_PLATFORM_USETHREADS
3806         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3807         manager->maxfd = manager->pipe_fds[0];
3808 #else /* ISC_PLATFORM_USETHREADS */
3809         manager->maxfd = 0;
3810 #endif /* ISC_PLATFORM_USETHREADS */
3811 #endif  /* USE_KQUEUE */
3812
3813         return (ISC_R_SUCCESS);
3814 }
3815
3816 static void
3817 cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3818 #ifdef ISC_PLATFORM_USETHREADS
3819         isc_result_t result;
3820
3821         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3822         if (result != ISC_R_SUCCESS) {
3823                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3824                                  "epoll_ctl(DEL) %s",
3825                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3826                                                 ISC_MSG_FAILED, "failed"));
3827         }
3828 #endif  /* ISC_PLATFORM_USETHREADS */
3829
3830 #ifdef USE_KQUEUE
3831         close(manager->kqueue_fd);
3832         isc_mem_put(mctx, manager->events,
3833                     sizeof(struct kevent) * manager->nevents);
3834 #elif defined(USE_EPOLL)
3835         close(manager->epoll_fd);
3836         isc_mem_put(mctx, manager->events,
3837                     sizeof(struct epoll_event) * manager->nevents);
3838 #elif defined(USE_DEVPOLL)
3839         close(manager->devpoll_fd);
3840         isc_mem_put(mctx, manager->events,
3841                     sizeof(struct pollfd) * manager->nevents);
3842         isc_mem_put(mctx, manager->fdpollinfo,
3843                     sizeof(pollinfo_t) * manager->maxsocks);
3844 #elif defined(USE_SELECT)
3845         if (manager->read_fds != NULL)
3846                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3847         if (manager->read_fds_copy != NULL)
3848                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3849         if (manager->write_fds != NULL)
3850                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3851         if (manager->write_fds_copy != NULL)
3852                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3853 #endif  /* USE_KQUEUE */
3854 }
3855
3856 isc_result_t
3857 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3858         return (isc_socketmgr_create2(mctx, managerp, 0));
3859 }
3860
3861 isc_result_t
3862 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3863                       unsigned int maxsocks)
3864 {
3865         int i;
3866         isc_socketmgr_t *manager;
3867 #ifdef ISC_PLATFORM_USETHREADS
3868         char strbuf[ISC_STRERRORSIZE];
3869 #endif
3870         isc_result_t result;
3871
3872         REQUIRE(managerp != NULL && *managerp == NULL);
3873
3874 #ifndef ISC_PLATFORM_USETHREADS
3875         if (socketmgr != NULL) {
3876                 /* Don't allow maxsocks to be updated */
3877                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3878                         return (ISC_R_EXISTS);
3879
3880                 socketmgr->refs++;
3881                 *managerp = socketmgr;
3882                 return (ISC_R_SUCCESS);
3883         }
3884 #endif /* ISC_PLATFORM_USETHREADS */
3885
3886         if (maxsocks == 0)
3887                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3888
3889         manager = isc_mem_get(mctx, sizeof(*manager));
3890         if (manager == NULL)
3891                 return (ISC_R_NOMEMORY);
3892
3893         /* zero-clear so that necessary cleanup on failure will be easy */
3894         memset(manager, 0, sizeof(*manager));
3895         manager->maxsocks = maxsocks;
3896         manager->reserved = 0;
3897         manager->fds = isc_mem_get(mctx,
3898                                    manager->maxsocks * sizeof(isc_socket_t *));
3899         if (manager->fds == NULL) {
3900                 result = ISC_R_NOMEMORY;
3901                 goto free_manager;
3902         }
3903         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3904         if (manager->fdstate == NULL) {
3905                 result = ISC_R_NOMEMORY;
3906                 goto free_manager;
3907         }
3908         manager->stats = NULL;
3909
3910         manager->magic = SOCKET_MANAGER_MAGIC;
3911         manager->mctx = NULL;
3912         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3913         ISC_LIST_INIT(manager->socklist);
3914         result = isc_mutex_init(&manager->lock);
3915         if (result != ISC_R_SUCCESS)
3916                 goto free_manager;
3917         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3918         if (manager->fdlock == NULL) {
3919                 result = ISC_R_NOMEMORY;
3920                 goto cleanup_lock;
3921         }
3922         for (i = 0; i < FDLOCK_COUNT; i++) {
3923                 result = isc_mutex_init(&manager->fdlock[i]);
3924                 if (result != ISC_R_SUCCESS) {
3925                         while (--i >= 0)
3926                                 DESTROYLOCK(&manager->fdlock[i]);
3927                         isc_mem_put(mctx, manager->fdlock,
3928                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3929                         manager->fdlock = NULL;
3930                         goto cleanup_lock;
3931                 }
3932         }
3933
3934 #ifdef ISC_PLATFORM_USETHREADS
3935         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3936                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3937                                  "isc_condition_init() %s",
3938                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3939                                                 ISC_MSG_FAILED, "failed"));
3940                 result = ISC_R_UNEXPECTED;
3941                 goto cleanup_lock;
3942         }
3943
3944         /*
3945          * Create the special fds that will be used to wake up the
3946          * select/poll loop when something internal needs to be done.
3947          */
3948         if (pipe(manager->pipe_fds) != 0) {
3949                 isc__strerror(errno, strbuf, sizeof(strbuf));
3950                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3951                                  "pipe() %s: %s",
3952                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3953                                                 ISC_MSG_FAILED, "failed"),
3954                                  strbuf);
3955                 result = ISC_R_UNEXPECTED;
3956                 goto cleanup_condition;
3957         }
3958
3959         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3960 #if 0
3961         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3962 #endif
3963 #else /* ISC_PLATFORM_USETHREADS */
3964         manager->refs = 1;
3965 #endif /* ISC_PLATFORM_USETHREADS */
3966
3967         /*
3968          * Set up initial state for the select loop
3969          */
3970         result = setup_watcher(mctx, manager);
3971         if (result != ISC_R_SUCCESS)
3972                 goto cleanup;
3973         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3974 #ifdef ISC_PLATFORM_USETHREADS
3975         /*
3976          * Start up the select/poll thread.
3977          */
3978         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3979             ISC_R_SUCCESS) {
3980                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3981                                  "isc_thread_create() %s",
3982                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3983                                                 ISC_MSG_FAILED, "failed"));
3984                 cleanup_watcher(mctx, manager);
3985                 result = ISC_R_UNEXPECTED;
3986                 goto cleanup;
3987         }
3988 #endif /* ISC_PLATFORM_USETHREADS */
3989         isc_mem_attach(mctx, &manager->mctx);
3990
3991 #ifndef ISC_PLATFORM_USETHREADS
3992         socketmgr = manager;
3993 #endif /* ISC_PLATFORM_USETHREADS */
3994         *managerp = manager;
3995
3996         return (ISC_R_SUCCESS);
3997
3998 cleanup:
3999 #ifdef ISC_PLATFORM_USETHREADS
4000         (void)close(manager->pipe_fds[0]);
4001         (void)close(manager->pipe_fds[1]);
4002 #endif  /* ISC_PLATFORM_USETHREADS */
4003
4004 #ifdef ISC_PLATFORM_USETHREADS
4005 cleanup_condition:
4006         (void)isc_condition_destroy(&manager->shutdown_ok);
4007 #endif  /* ISC_PLATFORM_USETHREADS */
4008
4009
4010 cleanup_lock:
4011         if (manager->fdlock != NULL) {
4012                 for (i = 0; i < FDLOCK_COUNT; i++)
4013                         DESTROYLOCK(&manager->fdlock[i]);
4014         }
4015         DESTROYLOCK(&manager->lock);
4016
4017 free_manager:
4018         if (manager->fdlock != NULL) {
4019                 isc_mem_put(mctx, manager->fdlock,
4020                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4021         }
4022         if (manager->fdstate != NULL) {
4023                 isc_mem_put(mctx, manager->fdstate,
4024                             manager->maxsocks * sizeof(int));
4025         }
4026         if (manager->fds != NULL) {
4027                 isc_mem_put(mctx, manager->fds,
4028                             manager->maxsocks * sizeof(isc_socket_t *));
4029         }
4030         isc_mem_put(mctx, manager, sizeof(*manager));
4031
4032         return (result);
4033 }
4034
4035 isc_result_t
4036 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
4037         REQUIRE(VALID_MANAGER(manager));
4038         REQUIRE(nsockp != NULL);
4039
4040         *nsockp = manager->maxsocks;
4041
4042         return (ISC_R_SUCCESS);
4043 }
4044
4045 void
4046 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
4047         REQUIRE(VALID_MANAGER(manager));
4048         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4049         REQUIRE(manager->stats == NULL);
4050         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4051
4052         isc_stats_attach(stats, &manager->stats);
4053 }
4054
4055 void
4056 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
4057         isc_socketmgr_t *manager;
4058         int i;
4059         isc_mem_t *mctx;
4060
4061         /*
4062          * Destroy a socket manager.
4063          */
4064
4065         REQUIRE(managerp != NULL);
4066         manager = *managerp;
4067         REQUIRE(VALID_MANAGER(manager));
4068
4069 #ifndef ISC_PLATFORM_USETHREADS
4070         if (manager->refs > 1) {
4071                 manager->refs--;
4072                 *managerp = NULL;
4073                 return;
4074         }
4075 #endif /* ISC_PLATFORM_USETHREADS */
4076
4077         LOCK(&manager->lock);
4078
4079 #ifdef ISC_PLATFORM_USETHREADS
4080         /*
4081          * Wait for all sockets to be destroyed.
4082          */
4083         while (!ISC_LIST_EMPTY(manager->socklist)) {
4084                 manager_log(manager, CREATION, "%s",
4085                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4086                                            ISC_MSG_SOCKETSREMAIN,
4087                                            "sockets exist"));
4088                 WAIT(&manager->shutdown_ok, &manager->lock);
4089         }
4090 #else /* ISC_PLATFORM_USETHREADS */
4091         /*
4092          * Hope all sockets have been destroyed.
4093          */
4094         if (!ISC_LIST_EMPTY(manager->socklist)) {
4095                 manager_log(manager, CREATION, "%s",
4096                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4097                                            ISC_MSG_SOCKETSREMAIN,
4098                                            "sockets exist"));
4099                 INSIST(0);
4100         }
4101 #endif /* ISC_PLATFORM_USETHREADS */
4102
4103         UNLOCK(&manager->lock);
4104
4105         /*
4106          * Here, poke our select/poll thread.  Do this by closing the write
4107          * half of the pipe, which will send EOF to the read half.
4108          * This is currently a no-op in the non-threaded case.
4109          */
4110         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4111
4112 #ifdef ISC_PLATFORM_USETHREADS
4113         /*
4114          * Wait for thread to exit.
4115          */
4116         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4117                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4118                                  "isc_thread_join() %s",
4119                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4120                                                 ISC_MSG_FAILED, "failed"));
4121 #endif /* ISC_PLATFORM_USETHREADS */
4122
4123         /*
4124          * Clean up.
4125          */
4126         cleanup_watcher(manager->mctx, manager);
4127
4128 #ifdef ISC_PLATFORM_USETHREADS
4129         (void)close(manager->pipe_fds[0]);
4130         (void)close(manager->pipe_fds[1]);
4131         (void)isc_condition_destroy(&manager->shutdown_ok);
4132 #endif /* ISC_PLATFORM_USETHREADS */
4133
4134         for (i = 0; i < (int)manager->maxsocks; i++)
4135                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4136                         (void)close(i);
4137
4138         isc_mem_put(manager->mctx, manager->fds,
4139                     manager->maxsocks * sizeof(isc_socket_t *));
4140         isc_mem_put(manager->mctx, manager->fdstate,
4141                     manager->maxsocks * sizeof(int));
4142
4143         if (manager->stats != NULL)
4144                 isc_stats_detach(&manager->stats);
4145
4146         if (manager->fdlock != NULL) {
4147                 for (i = 0; i < FDLOCK_COUNT; i++)
4148                         DESTROYLOCK(&manager->fdlock[i]);
4149                 isc_mem_put(manager->mctx, manager->fdlock,
4150                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4151         }
4152         DESTROYLOCK(&manager->lock);
4153         manager->magic = 0;
4154         mctx= manager->mctx;
4155         isc_mem_put(mctx, manager, sizeof(*manager));
4156
4157         isc_mem_detach(&mctx);
4158
4159         *managerp = NULL;
4160 }
4161
4162 static isc_result_t
4163 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4164             unsigned int flags)
4165 {
4166         int io_state;
4167         isc_boolean_t have_lock = ISC_FALSE;
4168         isc_task_t *ntask = NULL;
4169         isc_result_t result = ISC_R_SUCCESS;
4170
4171         dev->ev_sender = task;
4172
4173         if (sock->type == isc_sockettype_udp) {
4174                 io_state = doio_recv(sock, dev);
4175         } else {
4176                 LOCK(&sock->lock);
4177                 have_lock = ISC_TRUE;
4178
4179                 if (ISC_LIST_EMPTY(sock->recv_list))
4180                         io_state = doio_recv(sock, dev);
4181                 else
4182                         io_state = DOIO_SOFT;
4183         }
4184
4185         switch (io_state) {
4186         case DOIO_SOFT:
4187                 /*
4188                  * We couldn't read all or part of the request right now, so
4189                  * queue it.
4190                  *
4191                  * Attach to socket and to task
4192                  */
4193                 isc_task_attach(task, &ntask);
4194                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4195
4196                 if (!have_lock) {
4197                         LOCK(&sock->lock);
4198                         have_lock = ISC_TRUE;
4199                 }
4200
4201                 /*
4202                  * Enqueue the request.  If the socket was previously not being
4203                  * watched, poke the watcher to start paying attention to it.
4204                  */
4205                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4206                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4207                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4208
4209                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4210                            "socket_recv: event %p -> task %p",
4211                            dev, ntask);
4212
4213                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4214                         result = ISC_R_INPROGRESS;
4215                 break;
4216
4217         case DOIO_EOF:
4218                 dev->result = ISC_R_EOF;
4219                 /* fallthrough */
4220
4221         case DOIO_HARD:
4222         case DOIO_SUCCESS:
4223                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4224                         send_recvdone_event(sock, &dev);
4225                 break;
4226         }
4227
4228         if (have_lock)
4229                 UNLOCK(&sock->lock);
4230
4231         return (result);
4232 }
4233
4234 isc_result_t
4235 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4236                  unsigned int minimum, isc_task_t *task,
4237                  isc_taskaction_t action, const void *arg)
4238 {
4239         isc_socketevent_t *dev;
4240         isc_socketmgr_t *manager;
4241         unsigned int iocount;
4242         isc_buffer_t *buffer;
4243
4244         REQUIRE(VALID_SOCKET(sock));
4245         REQUIRE(buflist != NULL);
4246         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4247         REQUIRE(task != NULL);
4248         REQUIRE(action != NULL);
4249
4250         manager = sock->manager;
4251         REQUIRE(VALID_MANAGER(manager));
4252
4253         iocount = isc_bufferlist_availablecount(buflist);
4254         REQUIRE(iocount > 0);
4255
4256         INSIST(sock->bound);
4257
4258         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4259         if (dev == NULL)
4260                 return (ISC_R_NOMEMORY);
4261
4262         /*
4263          * UDP sockets are always partial read
4264          */
4265         if (sock->type == isc_sockettype_udp)
4266                 dev->minimum = 1;
4267         else {
4268                 if (minimum == 0)
4269                         dev->minimum = iocount;
4270                 else
4271                         dev->minimum = minimum;
4272         }
4273
4274         /*
4275          * Move each buffer from the passed in list to our internal one.
4276          */
4277         buffer = ISC_LIST_HEAD(*buflist);
4278         while (buffer != NULL) {
4279                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4280                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4281                 buffer = ISC_LIST_HEAD(*buflist);
4282         }
4283
4284         return (socket_recv(sock, dev, task, 0));
4285 }
4286
4287 isc_result_t
4288 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4289                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4290 {
4291         isc_socketevent_t *dev;
4292         isc_socketmgr_t *manager;
4293
4294         REQUIRE(VALID_SOCKET(sock));
4295         REQUIRE(action != NULL);
4296
4297         manager = sock->manager;
4298         REQUIRE(VALID_MANAGER(manager));
4299
4300         INSIST(sock->bound);
4301
4302         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4303         if (dev == NULL)
4304                 return (ISC_R_NOMEMORY);
4305
4306         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4307 }
4308
4309 isc_result_t
4310 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
4311                  unsigned int minimum, isc_task_t *task,
4312                  isc_socketevent_t *event, unsigned int flags)
4313 {
4314         event->ev_sender = sock;
4315         event->result = ISC_R_UNSET;
4316         ISC_LIST_INIT(event->bufferlist);
4317         event->region = *region;
4318         event->n = 0;
4319         event->offset = 0;
4320         event->attributes = 0;
4321
4322         /*
4323          * UDP sockets are always partial read.
4324          */
4325         if (sock->type == isc_sockettype_udp)
4326                 event->minimum = 1;
4327         else {
4328                 if (minimum == 0)
4329                         event->minimum = region->length;
4330                 else
4331                         event->minimum = minimum;
4332         }
4333
4334         return (socket_recv(sock, event, task, flags));
4335 }
4336
4337 static isc_result_t
4338 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4339             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4340             unsigned int flags)
4341 {
4342         int io_state;
4343         isc_boolean_t have_lock = ISC_FALSE;
4344         isc_task_t *ntask = NULL;
4345         isc_result_t result = ISC_R_SUCCESS;
4346
4347         dev->ev_sender = task;
4348
4349         set_dev_address(address, sock, dev);
4350         if (pktinfo != NULL) {
4351                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4352                 dev->pktinfo = *pktinfo;
4353
4354                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4355                     !isc_sockaddr_islinklocal(&dev->address)) {
4356                         socket_log(sock, NULL, TRACE, isc_msgcat,
4357                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4358                                    "pktinfo structure provided, ifindex %u "
4359                                    "(set to 0)", pktinfo->ipi6_ifindex);
4360
4361                         /*
4362                          * Set the pktinfo index to 0 here, to let the
4363                          * kernel decide what interface it should send on.
4364                          */
4365                         dev->pktinfo.ipi6_ifindex = 0;
4366                 }
4367         }
4368
4369         if (sock->type == isc_sockettype_udp)
4370                 io_state = doio_send(sock, dev);
4371         else {
4372                 LOCK(&sock->lock);
4373                 have_lock = ISC_TRUE;
4374
4375                 if (ISC_LIST_EMPTY(sock->send_list))
4376                         io_state = doio_send(sock, dev);
4377                 else
4378                         io_state = DOIO_SOFT;
4379         }
4380
4381         switch (io_state) {
4382         case DOIO_SOFT:
4383                 /*
4384                  * We couldn't send all or part of the request right now, so
4385                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4386                  */
4387                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4388                         isc_task_attach(task, &ntask);
4389                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4390
4391                         if (!have_lock) {
4392                                 LOCK(&sock->lock);
4393                                 have_lock = ISC_TRUE;
4394                         }
4395
4396                         /*
4397                          * Enqueue the request.  If the socket was previously
4398                          * not being watched, poke the watcher to start
4399                          * paying attention to it.
4400                          */
4401                         if (ISC_LIST_EMPTY(sock->send_list) &&
4402                             !sock->pending_send)
4403                                 select_poke(sock->manager, sock->fd,
4404                                             SELECT_POKE_WRITE);
4405                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4406
4407                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4408                                    "socket_send: event %p -> task %p",
4409                                    dev, ntask);
4410
4411                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4412                                 result = ISC_R_INPROGRESS;
4413                         break;
4414                 }
4415
4416         case DOIO_HARD:
4417         case DOIO_SUCCESS:
4418                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4419                         send_senddone_event(sock, &dev);
4420                 break;
4421         }
4422
4423         if (have_lock)
4424                 UNLOCK(&sock->lock);
4425
4426         return (result);
4427 }
4428
4429 isc_result_t
4430 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
4431                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4432 {
4433         /*
4434          * REQUIRE() checking is performed in isc_socket_sendto().
4435          */
4436         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
4437                                   NULL));
4438 }
4439
4440 isc_result_t
4441 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
4442                   isc_task_t *task, isc_taskaction_t action, const void *arg,
4443                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4444 {
4445         isc_socketevent_t *dev;
4446         isc_socketmgr_t *manager;
4447
4448         REQUIRE(VALID_SOCKET(sock));
4449         REQUIRE(region != NULL);
4450         REQUIRE(task != NULL);
4451         REQUIRE(action != NULL);
4452
4453         manager = sock->manager;
4454         REQUIRE(VALID_MANAGER(manager));
4455
4456         INSIST(sock->bound);
4457
4458         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4459         if (dev == NULL)
4460                 return (ISC_R_NOMEMORY);
4461
4462         dev->region = *region;
4463
4464         return (socket_send(sock, dev, task, address, pktinfo, 0));
4465 }
4466
4467 isc_result_t
4468 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4469                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4470 {
4471         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4472                                    NULL));
4473 }
4474
4475 isc_result_t
4476 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4477                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4478                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4479 {
4480         isc_socketevent_t *dev;
4481         isc_socketmgr_t *manager;
4482         unsigned int iocount;
4483         isc_buffer_t *buffer;
4484
4485         REQUIRE(VALID_SOCKET(sock));
4486         REQUIRE(buflist != NULL);
4487         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4488         REQUIRE(task != NULL);
4489         REQUIRE(action != NULL);
4490
4491         manager = sock->manager;
4492         REQUIRE(VALID_MANAGER(manager));
4493
4494         iocount = isc_bufferlist_usedcount(buflist);
4495         REQUIRE(iocount > 0);
4496
4497         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4498         if (dev == NULL)
4499                 return (ISC_R_NOMEMORY);
4500
4501         /*
4502          * Move each buffer from the passed in list to our internal one.
4503          */
4504         buffer = ISC_LIST_HEAD(*buflist);
4505         while (buffer != NULL) {
4506                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4507                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4508                 buffer = ISC_LIST_HEAD(*buflist);
4509         }
4510
4511         return (socket_send(sock, dev, task, address, pktinfo, 0));
4512 }
4513
4514 isc_result_t
4515 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4516                    isc_task_t *task,
4517                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4518                    isc_socketevent_t *event, unsigned int flags)
4519 {
4520         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4521         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4522                 REQUIRE(sock->type == isc_sockettype_udp);
4523         event->ev_sender = sock;
4524         event->result = ISC_R_UNSET;
4525         ISC_LIST_INIT(event->bufferlist);
4526         event->region = *region;
4527         event->n = 0;
4528         event->offset = 0;
4529         event->attributes = 0;
4530
4531         return (socket_send(sock, event, task, address, pktinfo, flags));
4532 }
4533
4534 void
4535 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4536 #ifdef ISC_PLATFORM_HAVESYSUNH
4537         int s;
4538         struct stat sb;
4539         char strbuf[ISC_STRERRORSIZE];
4540
4541         if (sockaddr->type.sa.sa_family != AF_UNIX)
4542                 return;
4543
4544 #ifndef S_ISSOCK
4545 #if defined(S_IFMT) && defined(S_IFSOCK)
4546 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4547 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4548 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4549 #endif
4550 #endif
4551
4552 #ifndef S_ISFIFO
4553 #if defined(S_IFMT) && defined(S_IFIFO)
4554 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4555 #elif defined(_S_IFMT) && defined(S_IFIFO)
4556 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4557 #endif
4558 #endif
4559
4560 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4561 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4562 #endif
4563
4564 #ifndef S_ISFIFO
4565 #define S_ISFIFO(mode) 0
4566 #endif
4567
4568 #ifndef S_ISSOCK
4569 #define S_ISSOCK(mode) 0
4570 #endif
4571
4572         if (active) {
4573                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4574                         isc__strerror(errno, strbuf, sizeof(strbuf));
4575                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4576                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4577                                       "isc_socket_cleanunix: stat(%s): %s",
4578                                       sockaddr->type.sunix.sun_path, strbuf);
4579                         return;
4580                 }
4581                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4582                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4583                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4584                                       "isc_socket_cleanunix: %s: not a socket",
4585                                       sockaddr->type.sunix.sun_path);
4586                         return;
4587                 }
4588                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4589                         isc__strerror(errno, strbuf, sizeof(strbuf));
4590                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4591                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4592                                       "isc_socket_cleanunix: unlink(%s): %s",
4593                                       sockaddr->type.sunix.sun_path, strbuf);
4594                 }
4595                 return;
4596         }
4597
4598         s = socket(AF_UNIX, SOCK_STREAM, 0);
4599         if (s < 0) {
4600                 isc__strerror(errno, strbuf, sizeof(strbuf));
4601                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4602                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4603                               "isc_socket_cleanunix: socket(%s): %s",
4604                               sockaddr->type.sunix.sun_path, strbuf);
4605                 return;
4606         }
4607
4608         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4609                 switch (errno) {
4610                 case ENOENT:    /* We exited cleanly last time */
4611                         break;
4612                 default:
4613                         isc__strerror(errno, strbuf, sizeof(strbuf));
4614                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4615                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4616                                       "isc_socket_cleanunix: stat(%s): %s",
4617                                       sockaddr->type.sunix.sun_path, strbuf);
4618                         break;
4619                 }
4620                 goto cleanup;
4621         }
4622
4623         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4624                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4625                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4626                               "isc_socket_cleanunix: %s: not a socket",
4627                               sockaddr->type.sunix.sun_path);
4628                 goto cleanup;
4629         }
4630
4631         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4632                     sizeof(sockaddr->type.sunix)) < 0) {
4633                 switch (errno) {
4634                 case ECONNREFUSED:
4635                 case ECONNRESET:
4636                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4637                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4638                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4639                                               ISC_LOGMODULE_SOCKET,
4640                                               ISC_LOG_WARNING,
4641                                               "isc_socket_cleanunix: "
4642                                               "unlink(%s): %s",
4643                                               sockaddr->type.sunix.sun_path,
4644                                               strbuf);
4645                         }
4646                         break;
4647                 default:
4648                         isc__strerror(errno, strbuf, sizeof(strbuf));
4649                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4650                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4651                                       "isc_socket_cleanunix: connect(%s): %s",
4652                                       sockaddr->type.sunix.sun_path, strbuf);
4653                         break;
4654                 }
4655         }
4656  cleanup:
4657         close(s);
4658 #else
4659         UNUSED(sockaddr);
4660         UNUSED(active);
4661 #endif
4662 }
4663
4664 isc_result_t
4665 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4666                     isc_uint32_t owner, isc_uint32_t group)
4667 {
4668 #ifdef ISC_PLATFORM_HAVESYSUNH
4669         isc_result_t result = ISC_R_SUCCESS;
4670         char strbuf[ISC_STRERRORSIZE];
4671         char path[sizeof(sockaddr->type.sunix.sun_path)];
4672 #ifdef NEED_SECURE_DIRECTORY
4673         char *slash;
4674 #endif
4675
4676         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4677         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4678         strcpy(path, sockaddr->type.sunix.sun_path);
4679
4680 #ifdef NEED_SECURE_DIRECTORY
4681         slash = strrchr(path, '/');
4682         if (slash != NULL) {
4683                 if (slash != path)
4684                         *slash = '\0';
4685                 else
4686                         strcpy(path, "/");
4687         } else
4688                 strcpy(path, ".");
4689 #endif
4690
4691         if (chmod(path, perm) < 0) {
4692                 isc__strerror(errno, strbuf, sizeof(strbuf));
4693                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4694                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4695                               "isc_socket_permunix: chmod(%s, %d): %s",
4696                               path, perm, strbuf);
4697                 result = ISC_R_FAILURE;
4698         }
4699         if (chown(path, owner, group) < 0) {
4700                 isc__strerror(errno, strbuf, sizeof(strbuf));
4701                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4702                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4703                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4704                               path, owner, group,
4705                               strbuf);
4706                 result = ISC_R_FAILURE;
4707         }
4708         return (result);
4709 #else
4710         UNUSED(sockaddr);
4711         UNUSED(perm);
4712         UNUSED(owner);
4713         UNUSED(group);
4714         return (ISC_R_NOTIMPLEMENTED);
4715 #endif
4716 }
4717
4718 isc_result_t
4719 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4720                 unsigned int options) {
4721         char strbuf[ISC_STRERRORSIZE];
4722         int on = 1;
4723
4724         LOCK(&sock->lock);
4725
4726         INSIST(!sock->bound);
4727
4728         if (sock->pf != sockaddr->type.sa.sa_family) {
4729                 UNLOCK(&sock->lock);
4730                 return (ISC_R_FAMILYMISMATCH);
4731         }
4732         /*
4733          * Only set SO_REUSEADDR when we want a specific port.
4734          */
4735 #ifdef AF_UNIX
4736         if (sock->pf == AF_UNIX)
4737                 goto bind_socket;
4738 #endif
4739         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4740             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4741             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4742                        sizeof(on)) < 0) {
4743                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4744                                  "setsockopt(%d) %s", sock->fd,
4745                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4746                                                 ISC_MSG_FAILED, "failed"));
4747                 /* Press on... */
4748         }
4749 #ifdef AF_UNIX
4750  bind_socket:
4751 #endif
4752         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4753                 inc_stats(sock->manager->stats,
4754                           sock->statsindex[STATID_BINDFAIL]);
4755
4756                 UNLOCK(&sock->lock);
4757                 switch (errno) {
4758                 case EACCES:
4759                         return (ISC_R_NOPERM);
4760                 case EADDRNOTAVAIL:
4761                         return (ISC_R_ADDRNOTAVAIL);
4762                 case EADDRINUSE:
4763                         return (ISC_R_ADDRINUSE);
4764                 case EINVAL:
4765                         return (ISC_R_BOUND);
4766                 default:
4767                         isc__strerror(errno, strbuf, sizeof(strbuf));
4768                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4769                                          strbuf);
4770                         return (ISC_R_UNEXPECTED);
4771                 }
4772         }
4773
4774         socket_log(sock, sockaddr, TRACE,
4775                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4776         sock->bound = 1;
4777
4778         UNLOCK(&sock->lock);
4779         return (ISC_R_SUCCESS);
4780 }
4781
/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
4787 #undef ENABLE_ACCEPTFILTER
4788
4789 isc_result_t
4790 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4791 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4792         char strbuf[ISC_STRERRORSIZE];
4793         struct accept_filter_arg afa;
4794 #else
4795         UNUSED(sock);
4796         UNUSED(filter);
4797 #endif
4798
4799         REQUIRE(VALID_SOCKET(sock));
4800
4801 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4802         bzero(&afa, sizeof(afa));
4803         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4804         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4805                          &afa, sizeof(afa)) == -1) {
4806                 isc__strerror(errno, strbuf, sizeof(strbuf));
4807                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4808                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4809                            strbuf);
4810                 return (ISC_R_FAILURE);
4811         }
4812         return (ISC_R_SUCCESS);
4813 #else
4814         return (ISC_R_NOTIMPLEMENTED);
4815 #endif
4816 }
4817
4818 /*
4819  * Set up to listen on a given socket.  We do this by creating an internal
4820  * event that will be dispatched when the socket has read activity.  The
4821  * watcher will send the internal event to the task when there is a new
4822  * connection.
4823  *
4824  * Unlike in read, we don't preallocate a done event here.  Every time there
4825  * is a new connection we'll have to allocate a new one anyway, so we might
4826  * as well keep things simple rather than having to track them.
4827  */
4828 isc_result_t
4829 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4830         char strbuf[ISC_STRERRORSIZE];
4831
4832         REQUIRE(VALID_SOCKET(sock));
4833
4834         LOCK(&sock->lock);
4835
4836         REQUIRE(!sock->listener);
4837         REQUIRE(sock->bound);
4838         REQUIRE(sock->type == isc_sockettype_tcp ||
4839                 sock->type == isc_sockettype_unix);
4840
4841         if (backlog == 0)
4842                 backlog = SOMAXCONN;
4843
4844         if (listen(sock->fd, (int)backlog) < 0) {
4845                 UNLOCK(&sock->lock);
4846                 isc__strerror(errno, strbuf, sizeof(strbuf));
4847
4848                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4849
4850                 return (ISC_R_UNEXPECTED);
4851         }
4852
4853         sock->listener = 1;
4854
4855         UNLOCK(&sock->lock);
4856         return (ISC_R_SUCCESS);
4857 }
4858
4859 /*
4860  * This should try to do aggressive accept() XXXMLG
4861  */
4862 isc_result_t
4863 isc_socket_accept(isc_socket_t *sock,
4864                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4865 {
4866         isc_socket_newconnev_t *dev;
4867         isc_socketmgr_t *manager;
4868         isc_task_t *ntask = NULL;
4869         isc_socket_t *nsock;
4870         isc_result_t result;
4871         isc_boolean_t do_poke = ISC_FALSE;
4872
4873         REQUIRE(VALID_SOCKET(sock));
4874         manager = sock->manager;
4875         REQUIRE(VALID_MANAGER(manager));
4876
4877         LOCK(&sock->lock);
4878
4879         REQUIRE(sock->listener);
4880
4881         /*
4882          * Sender field is overloaded here with the task we will be sending
4883          * this event to.  Just before the actual event is delivered the
4884          * actual ev_sender will be touched up to be the socket.
4885          */
4886         dev = (isc_socket_newconnev_t *)
4887                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4888                                    action, arg, sizeof(*dev));
4889         if (dev == NULL) {
4890                 UNLOCK(&sock->lock);
4891                 return (ISC_R_NOMEMORY);
4892         }
4893         ISC_LINK_INIT(dev, ev_link);
4894
4895         result = allocate_socket(manager, sock->type, &nsock);
4896         if (result != ISC_R_SUCCESS) {
4897                 isc_event_free(ISC_EVENT_PTR(&dev));
4898                 UNLOCK(&sock->lock);
4899                 return (result);
4900         }
4901
4902         /*
4903          * Attach to socket and to task.
4904          */
4905         isc_task_attach(task, &ntask);
4906         if (isc_task_exiting(ntask)) {
4907                 free_socket(&nsock);
4908                 isc_task_detach(&ntask);
4909                 isc_event_free(ISC_EVENT_PTR(&dev));
4910                 UNLOCK(&sock->lock);
4911                 return (ISC_R_SHUTTINGDOWN);
4912         }
4913         nsock->references++;
4914         nsock->statsindex = sock->statsindex;
4915
4916         dev->ev_sender = ntask;
4917         dev->newsocket = nsock;
4918
4919         /*
4920          * Poke watcher here.  We still have the socket locked, so there
4921          * is no race condition.  We will keep the lock for such a short
4922          * bit of time waking it up now or later won't matter all that much.
4923          */
4924         if (ISC_LIST_EMPTY(sock->accept_list))
4925                 do_poke = ISC_TRUE;
4926
4927         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4928
4929         if (do_poke)
4930                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4931
4932         UNLOCK(&sock->lock);
4933         return (ISC_R_SUCCESS);
4934 }
4935
4936 isc_result_t
4937 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4938                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4939 {
4940         isc_socket_connev_t *dev;
4941         isc_task_t *ntask = NULL;
4942         isc_socketmgr_t *manager;
4943         int cc;
4944         char strbuf[ISC_STRERRORSIZE];
4945         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4946
4947         REQUIRE(VALID_SOCKET(sock));
4948         REQUIRE(addr != NULL);
4949         REQUIRE(task != NULL);
4950         REQUIRE(action != NULL);
4951
4952         manager = sock->manager;
4953         REQUIRE(VALID_MANAGER(manager));
4954         REQUIRE(addr != NULL);
4955
4956         if (isc_sockaddr_ismulticast(addr))
4957                 return (ISC_R_MULTICAST);
4958
4959         LOCK(&sock->lock);
4960
4961         REQUIRE(!sock->connecting);
4962
4963         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4964                                                         ISC_SOCKEVENT_CONNECT,
4965                                                         action, arg,
4966                                                         sizeof(*dev));
4967         if (dev == NULL) {
4968                 UNLOCK(&sock->lock);
4969                 return (ISC_R_NOMEMORY);
4970         }
4971         ISC_LINK_INIT(dev, ev_link);
4972
4973         /*
4974          * Try to do the connect right away, as there can be only one
4975          * outstanding, and it might happen to complete.
4976          */
4977         sock->peer_address = *addr;
4978         cc = connect(sock->fd, &addr->type.sa, addr->length);
4979         if (cc < 0) {
4980                 /*
4981                  * HP-UX "fails" to connect a UDP socket and sets errno to
4982                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4983                  * a success and let the user detect it if it's really an error
4984                  * at the time of sending a packet on the socket.
4985                  */
4986                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4987                         cc = 0;
4988                         goto success;
4989                 }
4990                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4991                         goto queue;
4992
4993                 switch (errno) {
4994 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4995                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4996                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4997                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4998                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4999                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5000 #ifdef EHOSTDOWN
5001                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5002 #endif
5003                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5004                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5005                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5006                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5007                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5008 #undef ERROR_MATCH
5009                 }
5010
5011                 sock->connected = 0;
5012
5013                 isc__strerror(errno, strbuf, sizeof(strbuf));
5014                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
5015                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
5016                                  addrbuf, errno, strbuf);
5017
5018                 UNLOCK(&sock->lock);
5019                 inc_stats(sock->manager->stats,
5020                           sock->statsindex[STATID_CONNECTFAIL]);
5021                 isc_event_free(ISC_EVENT_PTR(&dev));
5022                 return (ISC_R_UNEXPECTED);
5023
5024         err_exit:
5025                 sock->connected = 0;
5026                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5027
5028                 UNLOCK(&sock->lock);
5029                 inc_stats(sock->manager->stats,
5030                           sock->statsindex[STATID_CONNECTFAIL]);
5031                 return (ISC_R_SUCCESS);
5032         }
5033
5034         /*
5035          * If connect completed, fire off the done event.
5036          */
5037  success:
5038         if (cc == 0) {
5039                 sock->connected = 1;
5040                 sock->bound = 1;
5041                 dev->result = ISC_R_SUCCESS;
5042                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5043
5044                 UNLOCK(&sock->lock);
5045
5046                 inc_stats(sock->manager->stats,
5047                           sock->statsindex[STATID_CONNECT]);
5048
5049                 return (ISC_R_SUCCESS);
5050         }
5051
5052  queue:
5053
5054         /*
5055          * Attach to task.
5056          */
5057         isc_task_attach(task, &ntask);
5058
5059         sock->connecting = 1;
5060
5061         dev->ev_sender = ntask;
5062
5063         /*
5064          * Poke watcher here.  We still have the socket locked, so there
5065          * is no race condition.  We will keep the lock for such a short
5066          * bit of time waking it up now or later won't matter all that much.
5067          */
5068         if (sock->connect_ev == NULL)
5069                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
5070
5071         sock->connect_ev = dev;
5072
5073         UNLOCK(&sock->lock);
5074         return (ISC_R_SUCCESS);
5075 }
5076
5077 /*
5078  * Called when a socket with a pending connect() finishes.
5079  */
5080 static void
5081 internal_connect(isc_task_t *me, isc_event_t *ev) {
5082         isc_socket_t *sock;
5083         isc_socket_connev_t *dev;
5084         isc_task_t *task;
5085         int cc;
5086         ISC_SOCKADDR_LEN_T optlen;
5087         char strbuf[ISC_STRERRORSIZE];
5088         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5089
5090         UNUSED(me);
5091         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5092
5093         sock = ev->ev_sender;
5094         INSIST(VALID_SOCKET(sock));
5095
5096         LOCK(&sock->lock);
5097
5098         /*
5099          * When the internal event was sent the reference count was bumped
5100          * to keep the socket around for us.  Decrement the count here.
5101          */
5102         INSIST(sock->references > 0);
5103         sock->references--;
5104         if (sock->references == 0) {
5105                 UNLOCK(&sock->lock);
5106                 destroy(&sock);
5107                 return;
5108         }
5109
5110         /*
5111          * Has this event been canceled?
5112          */
5113         dev = sock->connect_ev;
5114         if (dev == NULL) {
5115                 INSIST(!sock->connecting);
5116                 UNLOCK(&sock->lock);
5117                 return;
5118         }
5119
5120         INSIST(sock->connecting);
5121         sock->connecting = 0;
5122
5123         /*
5124          * Get any possible error status here.
5125          */
5126         optlen = sizeof(cc);
5127         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5128                        (void *)&cc, (void *)&optlen) < 0)
5129                 cc = errno;
5130         else
5131                 errno = cc;
5132
5133         if (errno != 0) {
5134                 /*
5135                  * If the error is EAGAIN, just re-select on this
5136                  * fd and pretend nothing strange happened.
5137                  */
5138                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5139                         sock->connecting = 1;
5140                         select_poke(sock->manager, sock->fd,
5141                                     SELECT_POKE_CONNECT);
5142                         UNLOCK(&sock->lock);
5143
5144                         return;
5145                 }
5146
5147                 inc_stats(sock->manager->stats,
5148                           sock->statsindex[STATID_CONNECTFAIL]);
5149
5150                 /*
5151                  * Translate other errors into ISC_R_* flavors.
5152                  */
5153                 switch (errno) {
5154 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5155                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5156                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5157                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5158                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5159                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5160 #ifdef EHOSTDOWN
5161                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5162 #endif
5163                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5164                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5165                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5166                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5167                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5168                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5169 #undef ERROR_MATCH
5170                 default:
5171                         dev->result = ISC_R_UNEXPECTED;
5172                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5173                                             sizeof(peerbuf));
5174                         isc__strerror(errno, strbuf, sizeof(strbuf));
5175                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5176                                          "internal_connect: connect(%s) %s",
5177                                          peerbuf, strbuf);
5178                 }
5179         } else {
5180                 inc_stats(sock->manager->stats,
5181                           sock->statsindex[STATID_CONNECT]);
5182                 dev->result = ISC_R_SUCCESS;
5183                 sock->connected = 1;
5184                 sock->bound = 1;
5185         }
5186
5187         sock->connect_ev = NULL;
5188
5189         UNLOCK(&sock->lock);
5190
5191         task = dev->ev_sender;
5192         dev->ev_sender = sock;
5193         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5194 }
5195
5196 isc_result_t
5197 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5198         isc_result_t result;
5199
5200         REQUIRE(VALID_SOCKET(sock));
5201         REQUIRE(addressp != NULL);
5202
5203         LOCK(&sock->lock);
5204
5205         if (sock->connected) {
5206                 *addressp = sock->peer_address;
5207                 result = ISC_R_SUCCESS;
5208         } else {
5209                 result = ISC_R_NOTCONNECTED;
5210         }
5211
5212         UNLOCK(&sock->lock);
5213
5214         return (result);
5215 }
5216
5217 isc_result_t
5218 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5219         ISC_SOCKADDR_LEN_T len;
5220         isc_result_t result;
5221         char strbuf[ISC_STRERRORSIZE];
5222
5223         REQUIRE(VALID_SOCKET(sock));
5224         REQUIRE(addressp != NULL);
5225
5226         LOCK(&sock->lock);
5227
5228         if (!sock->bound) {
5229                 result = ISC_R_NOTBOUND;
5230                 goto out;
5231         }
5232
5233         result = ISC_R_SUCCESS;
5234
5235         len = sizeof(addressp->type);
5236         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5237                 isc__strerror(errno, strbuf, sizeof(strbuf));
5238                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5239                                  strbuf);
5240                 result = ISC_R_UNEXPECTED;
5241                 goto out;
5242         }
5243         addressp->length = (unsigned int)len;
5244
5245  out:
5246         UNLOCK(&sock->lock);
5247
5248         return (result);
5249 }
5250
5251 /*
5252  * Run through the list of events on this socket, and cancel the ones
5253  * queued for task "task" of type "how".  "how" is a bitmask.
5254  */
5255 void
5256 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5257
5258         REQUIRE(VALID_SOCKET(sock));
5259
5260         /*
5261          * Quick exit if there is nothing to do.  Don't even bother locking
5262          * in this case.
5263          */
5264         if (how == 0)
5265                 return;
5266
5267         LOCK(&sock->lock);
5268
5269         /*
5270          * All of these do the same thing, more or less.
5271          * Each will:
5272          *      o If the internal event is marked as "posted" try to
5273          *        remove it from the task's queue.  If this fails, mark it
5274          *        as canceled instead, and let the task clean it up later.
5275          *      o For each I/O request for that task of that type, post
5276          *        its done event with status of "ISC_R_CANCELED".
5277          *      o Reset any state needed.
5278          */
5279         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5280             && !ISC_LIST_EMPTY(sock->recv_list)) {
5281                 isc_socketevent_t      *dev;
5282                 isc_socketevent_t      *next;
5283                 isc_task_t             *current_task;
5284
5285                 dev = ISC_LIST_HEAD(sock->recv_list);
5286
5287                 while (dev != NULL) {
5288                         current_task = dev->ev_sender;
5289                         next = ISC_LIST_NEXT(dev, ev_link);
5290
5291                         if ((task == NULL) || (task == current_task)) {
5292                                 dev->result = ISC_R_CANCELED;
5293                                 send_recvdone_event(sock, &dev);
5294                         }
5295                         dev = next;
5296                 }
5297         }
5298
5299         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5300             && !ISC_LIST_EMPTY(sock->send_list)) {
5301                 isc_socketevent_t      *dev;
5302                 isc_socketevent_t      *next;
5303                 isc_task_t             *current_task;
5304
5305                 dev = ISC_LIST_HEAD(sock->send_list);
5306
5307                 while (dev != NULL) {
5308                         current_task = dev->ev_sender;
5309                         next = ISC_LIST_NEXT(dev, ev_link);
5310
5311                         if ((task == NULL) || (task == current_task)) {
5312                                 dev->result = ISC_R_CANCELED;
5313                                 send_senddone_event(sock, &dev);
5314                         }
5315                         dev = next;
5316                 }
5317         }
5318
5319         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5320             && !ISC_LIST_EMPTY(sock->accept_list)) {
5321                 isc_socket_newconnev_t *dev;
5322                 isc_socket_newconnev_t *next;
5323                 isc_task_t             *current_task;
5324
5325                 dev = ISC_LIST_HEAD(sock->accept_list);
5326                 while (dev != NULL) {
5327                         current_task = dev->ev_sender;
5328                         next = ISC_LIST_NEXT(dev, ev_link);
5329
5330                         if ((task == NULL) || (task == current_task)) {
5331
5332                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5333                                                 ev_link);
5334
5335                                 dev->newsocket->references--;
5336                                 free_socket(&dev->newsocket);
5337
5338                                 dev->result = ISC_R_CANCELED;
5339                                 dev->ev_sender = sock;
5340                                 isc_task_sendanddetach(&current_task,
5341                                                        ISC_EVENT_PTR(&dev));
5342                         }
5343
5344                         dev = next;
5345                 }
5346         }
5347
5348         /*
5349          * Connecting is not a list.
5350          */
5351         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5352             && sock->connect_ev != NULL) {
5353                 isc_socket_connev_t    *dev;
5354                 isc_task_t             *current_task;
5355
5356                 INSIST(sock->connecting);
5357                 sock->connecting = 0;
5358
5359                 dev = sock->connect_ev;
5360                 current_task = dev->ev_sender;
5361
5362                 if ((task == NULL) || (task == current_task)) {
5363                         sock->connect_ev = NULL;
5364
5365                         dev->result = ISC_R_CANCELED;
5366                         dev->ev_sender = sock;
5367                         isc_task_sendanddetach(&current_task,
5368                                                ISC_EVENT_PTR(&dev));
5369                 }
5370         }
5371
5372         UNLOCK(&sock->lock);
5373 }
5374
5375 isc_sockettype_t
5376 isc_socket_gettype(isc_socket_t *sock) {
5377         REQUIRE(VALID_SOCKET(sock));
5378
5379         return (sock->type);
5380 }
5381
5382 isc_boolean_t
5383 isc_socket_isbound(isc_socket_t *sock) {
5384         isc_boolean_t val;
5385
5386         LOCK(&sock->lock);
5387         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5388         UNLOCK(&sock->lock);
5389
5390         return (val);
5391 }
5392
5393 void
5394 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
5395 #if defined(IPV6_V6ONLY)
5396         int onoff = yes ? 1 : 0;
5397 #else
5398         UNUSED(yes);
5399         UNUSED(sock);
5400 #endif
5401
5402         REQUIRE(VALID_SOCKET(sock));
5403
5404 #ifdef IPV6_V6ONLY
5405         if (sock->pf == AF_INET6) {
5406                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5407                                (void *)&onoff, sizeof(int)) < 0) {
5408                         char strbuf[ISC_STRERRORSIZE];
5409                         isc__strerror(errno, strbuf, sizeof(strbuf));
5410                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5411                                          "setsockopt(%d, IPV6_V6ONLY) "
5412                                          "%s: %s", sock->fd,
5413                                          isc_msgcat_get(isc_msgcat,
5414                                                         ISC_MSGSET_GENERAL,
5415                                                         ISC_MSG_FAILED,
5416                                                         "failed"),
5417                                          strbuf);
5418                 }
5419         }
5420         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5421 #endif
5422 }
5423
5424 #ifndef ISC_PLATFORM_USETHREADS
5425 /* In our assumed scenario, we can simply use a single static object. */
5426 static isc_socketwait_t swait_private;
5427
5428 int
5429 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
5430         int n;
5431 #ifdef USE_KQUEUE
5432         struct timespec ts, *tsp;
5433 #endif
5434 #ifdef USE_EPOLL
5435         int timeout;
5436 #endif
5437 #ifdef USE_DEVPOLL
5438         struct dvpoll dvp;
5439 #endif
5440
5441         REQUIRE(swaitp != NULL && *swaitp == NULL);
5442
5443         if (socketmgr == NULL)
5444                 return (0);
5445
5446 #ifdef USE_KQUEUE
5447         if (tvp != NULL) {
5448                 ts.tv_sec = tvp->tv_sec;
5449                 ts.tv_nsec = tvp->tv_usec * 1000;
5450                 tsp = &ts;
5451         } else
5452                 tsp = NULL;
5453         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
5454                                        socketmgr->events, socketmgr->nevents,
5455                                        tsp);
5456         n = swait_private.nevents;
5457 #elif defined(USE_EPOLL)
5458         if (tvp != NULL)
5459                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5460         else
5461                 timeout = -1;
5462         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
5463                                            socketmgr->events,
5464                                            socketmgr->nevents, timeout);
5465         n = swait_private.nevents;
5466 #elif defined(USE_DEVPOLL)
5467         dvp.dp_fds = socketmgr->events;
5468         dvp.dp_nfds = socketmgr->nevents;
5469         if (tvp != NULL) {
5470                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5471                         (tvp->tv_usec + 999) / 1000;
5472         } else
5473                 dvp.dp_timeout = -1;
5474         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
5475         n = swait_private.nevents;
5476 #elif defined(USE_SELECT)
5477         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
5478                socketmgr->fd_bufsize);
5479         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
5480                socketmgr->fd_bufsize);
5481
5482         swait_private.readset = socketmgr->read_fds_copy;
5483         swait_private.writeset = socketmgr->write_fds_copy;
5484         swait_private.maxfd = socketmgr->maxfd + 1;
5485
5486         n = select(swait_private.maxfd, swait_private.readset,
5487                    swait_private.writeset, NULL, tvp);
5488 #endif
5489
5490         *swaitp = &swait_private;
5491         return (n);
5492 }
5493
5494 isc_result_t
5495 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
5496         REQUIRE(swait == &swait_private);
5497
5498         if (socketmgr == NULL)
5499                 return (ISC_R_NOTFOUND);
5500
5501 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5502         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
5503         return (ISC_R_SUCCESS);
5504 #elif defined(USE_SELECT)
5505         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5506         return (ISC_R_SUCCESS);
5507 #endif
5508 }
5509 #endif /* ISC_PLATFORM_USETHREADS */
5510
5511 void
5512 isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
5513
5514         /*
5515          * Name 'socket'.
5516          */
5517
5518         REQUIRE(VALID_SOCKET(socket));
5519
5520         LOCK(&socket->lock);
5521         memset(socket->name, 0, sizeof(socket->name));
5522         strncpy(socket->name, name, sizeof(socket->name) - 1);
5523         socket->tag = tag;
5524         UNLOCK(&socket->lock);
5525 }
5526
5527 const char *
5528 isc_socket_getname(isc_socket_t *socket) {
5529         return (socket->name);
5530 }
5531
5532 void *
5533 isc_socket_gettag(isc_socket_t *socket) {
5534         return (socket->tag);
5535 }
5536
5537 #ifdef HAVE_LIBXML2
5538
5539 static const char *
5540 _socktype(isc_sockettype_t type)
5541 {
5542         if (type == isc_sockettype_udp)
5543                 return ("udp");
5544         else if (type == isc_sockettype_tcp)
5545                 return ("tcp");
5546         else if (type == isc_sockettype_unix)
5547                 return ("unix");
5548         else if (type == isc_sockettype_fdwatch)
5549                 return ("fdwatch");
5550         else
5551                 return ("not-initialized");
5552 }
5553
5554 void
5555 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
5556 {
5557         isc_socket_t *sock;
5558         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5559         isc_sockaddr_t addr;
5560         ISC_SOCKADDR_LEN_T len;
5561
5562         LOCK(&mgr->lock);
5563
5564 #ifndef ISC_PLATFORM_USETHREADS
5565         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5566         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5567         xmlTextWriterEndElement(writer);
5568 #endif
5569
5570         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5571         sock = ISC_LIST_HEAD(mgr->socklist);
5572         while (sock != NULL) {
5573                 LOCK(&sock->lock);
5574                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5575
5576                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5577                 xmlTextWriterWriteFormatString(writer, "%p", sock);
5578                 xmlTextWriterEndElement(writer);
5579
5580                 if (sock->name[0] != 0) {
5581                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5582                         xmlTextWriterWriteFormatString(writer, "%s",
5583                                                        sock->name);
5584                         xmlTextWriterEndElement(writer); /* name */
5585                 }
5586
5587                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5588                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5589                 xmlTextWriterEndElement(writer);
5590
5591                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5592                                           ISC_XMLCHAR _socktype(sock->type));
5593
5594                 if (sock->connected) {
5595                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5596                                             sizeof(peerbuf));
5597                         xmlTextWriterWriteElement(writer,
5598                                                   ISC_XMLCHAR "peer-address",
5599                                                   ISC_XMLCHAR peerbuf);
5600                 }
5601
5602                 len = sizeof(addr);
5603                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5604                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5605                         xmlTextWriterWriteElement(writer,
5606                                                   ISC_XMLCHAR "local-address",
5607                                                   ISC_XMLCHAR peerbuf);
5608                 }
5609
5610                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5611                 if (sock->pending_recv)
5612                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5613                                                 ISC_XMLCHAR "pending-receive");
5614                 if (sock->pending_send)
5615                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5616                                                   ISC_XMLCHAR "pending-send");
5617                 if (sock->pending_accept)
5618                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5619                                                  ISC_XMLCHAR "pending_accept");
5620                 if (sock->listener)
5621                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5622                                                   ISC_XMLCHAR "listener");
5623                 if (sock->connected)
5624                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5625                                                   ISC_XMLCHAR "connected");
5626                 if (sock->connecting)
5627                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5628                                                   ISC_XMLCHAR "connecting");
5629                 if (sock->bound)
5630                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5631                                                   ISC_XMLCHAR "bound");
5632
5633                 xmlTextWriterEndElement(writer); /* states */
5634
5635                 xmlTextWriterEndElement(writer); /* socket */
5636
5637                 UNLOCK(&sock->lock);
5638                 sock = ISC_LIST_NEXT(sock, link);
5639         }
5640         xmlTextWriterEndElement(writer); /* sockets */
5641
5642         UNLOCK(&mgr->lock);
5643 }
5644 #endif /* HAVE_LIBXML2 */