]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bind9/lib/isc/unix/socket.c
Merge from vendor/bind9/dist as of the 9.4.3 import
[FreeBSD/FreeBSD.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2008  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.237.18.56 2008/11/12 03:58:36 marka Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/strerror.h>
54 #include <isc/task.h>
55 #include <isc/thread.h>
56 #include <isc/util.h>
57
58 #ifdef ISC_PLATFORM_HAVESYSUNH
59 #include <sys/un.h>
60 #endif
61 #ifdef ISC_PLATFORM_HAVEKQUEUE
62 #include <sys/event.h>
63 #endif
64 #ifdef ISC_PLATFORM_HAVEEPOLL
65 #include <sys/epoll.h>
66 #endif
67 #ifdef ISC_PLATFORM_HAVEDEVPOLL
68 #include <sys/devpoll.h>
69 #endif
70
71 #include "errno2result.h"
72
73 #ifndef ISC_PLATFORM_USETHREADS
74 #include "socket_p.h"
75 #endif /* ISC_PLATFORM_USETHREADS */
76
77 /*%
78  * Choose the most preferable multiplex method.
79  */
80 #ifdef ISC_PLATFORM_HAVEKQUEUE
81 #define USE_KQUEUE
82 #elif defined (ISC_PLATFORM_HAVEEPOLL)
83 #define USE_EPOLL
84 #elif defined (ISC_PLATFORM_HAVEDEVPOLL)
85 #define USE_DEVPOLL
/*%
 * Per-descriptor flags for the /dev/poll backend: which directions
 * (read and/or write) have been registered for a descriptor.
 */
typedef struct {
	unsigned int want_read : 1;	/* POLLIN has been requested */
	unsigned int want_write : 1;	/* POLLOUT has been requested */
} pollinfo_t;
90 #else
91 #define USE_SELECT
92 #endif  /* ISC_PLATFORM_HAVEKQUEUE */
93
94 #ifndef ISC_PLATFORM_USETHREADS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*%
 * Wait result for the event-based backends (kqueue, epoll, /dev/poll).
 */
struct isc_socketwait {
	int nevents;		/* event count */
};
#elif defined(USE_SELECT)
/*%
 * Wait result for the select() backend.
 */
struct isc_socketwait {
	fd_set *readset;	/* descriptor set for reads */
	fd_set *writeset;	/* descriptor set for writes */
	int nfds;
	int maxfd;
};
#endif	/* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */
107 #endif /* !ISC_PLATFORM_USETHREADS */
108
109 /*%
110  * Maximum number of allowable open sockets.  This is also the maximum
111  * allowable socket file descriptor.
112  *
113  * Care should be taken before modifying this value for select():
114  * The API standard doesn't ensure select() accepts more than (the system default
115  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
116  * the vast majority of cases.  This constant should therefore be increased only
117  * when absolutely necessary and possible, i.e., the server is exhausting all
118  * available file descriptors (up to FD_SETSIZE) and the select() function
119  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
120  * always be true, but we keep using some of them to ensure as much
121  * portability as possible).  Note also that overall server performance
122  * may be rather worsened with a larger value of this constant due to
123  * inherent scalability problems of select().
124  *
125  * As a special note, this value shouldn't have to be touched if
126  * this is a build for an authoritative only DNS server.
127  */
128 #ifndef ISC_SOCKET_MAXSOCKETS
129 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
130 #define ISC_SOCKET_MAXSOCKETS 4096
131 #elif defined(USE_SELECT)
132 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
133 #endif  /* USE_KQUEUE... */
134 #endif  /* ISC_SOCKET_MAXSOCKETS */
135
136 #ifdef USE_SELECT
137 /*%
138  * Mac OS X needs a special definition to support larger values in select().
139  * We always define this because a larger value can be specified run-time.
140  */
141 #ifdef __APPLE__
142 #define _DARWIN_UNLIMITED_SELECT
143 #endif  /* __APPLE__ */
144 #endif  /* USE_SELECT */
145
146 #ifdef ISC_SOCKET_USE_POLLWATCH
147 /*%
148  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
149  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
150  * some of the specified FD.  The idea is based on the observation that it's
151  * likely for a busy server to keep receiving packets.  It specifically works
152  * as follows: the socket watcher is first initialized with the state of
153  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
154  * event occurs.  When it wakes up for a socket I/O event, it moves to the
155  * poll_active state, and sets the poll timeout to a short period
156  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
157  * watcher goes to the poll_checking state with the same timeout period.
158  * In this state, the watcher tries to detect whether this is a break
159  * during intermittent events or the kernel bug is triggered.  If the next
160  * polling reports an event within the short period, the previous timeout is
161  * likely to be a kernel bug, and so the watcher goes back to the active state.
162  * Otherwise, it moves to the idle state again.
163  *
164  * It's not clear whether this is a thread-related bug, but since we've only
165  * seen this with threads, this workaround is used only when enabling threads.
166  */
167
168 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
169
170 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
171 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
172 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
173 #endif  /* ISC_SOCKET_USE_POLLWATCH */
174
175 /*%
176  * Size of per-FD lock buckets.
177  */
178 #ifdef ISC_PLATFORM_USETHREADS
179 #define FDLOCK_COUNT            1024
180 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
181 #else
182 #define FDLOCK_COUNT            1
183 #define FDLOCK_ID(fd)           0
184 #endif  /* ISC_PLATFORM_USETHREADS */
185
186 /*%
187  * Maximum number of events communicated with the kernel.  There should normally
188  * be no need for having a large number.
189  */
190 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
191 #ifndef ISC_SOCKET_MAXEVENTS
192 #define ISC_SOCKET_MAXEVENTS    64
193 #endif
194 #endif
195
196 /*%
197  * Some systems define the socket length argument as an int, some as size_t,
198  * some as socklen_t.  This is here so it can be easily changed if needed.
199  */
200 #ifndef ISC_SOCKADDR_LEN_T
201 #define ISC_SOCKADDR_LEN_T unsigned int
202 #endif
203
204
205 #if defined(SO_BSDCOMPAT) && defined(__linux__)
206 #include <sys/utsname.h>
207 #endif
208
209 /*%
210  * Define what the possible "soft" errors can be.  These are non-fatal returns
211  * of various network related functions, like recv() and so on.
212  *
213  * For some reason, BSDI (and perhaps others) will sometimes return <0
214  * from recv() but will have errno==0.  This is broken, but we have to
215  * work around it here.
216  */
217 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
218                          (e) == EWOULDBLOCK || \
219                          (e) == EINTR || \
220                          (e) == 0)
221
222 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
223
224 /*!<
225  * DLVL(90)  --  Function entry/exit and other tracing.
226  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
227  * DLVL(60)  --  Socket data send/receive
228  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
229  * DLVL(20)  --  Socket creation/destruction.
230  */
231 #define TRACE_LEVEL             90
232 #define CORRECTNESS_LEVEL       70
233 #define IOEVENT_LEVEL           60
234 #define EVENT_LEVEL             50
235 #define CREATION_LEVEL          20
236
237 #define TRACE           DLVL(TRACE_LEVEL)
238 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
239 #define IOEVENT         DLVL(IOEVENT_LEVEL)
240 #define EVENT           DLVL(EVENT_LEVEL)
241 #define CREATION        DLVL(CREATION_LEVEL)
242
243 typedef isc_event_t intev_t;
244
245 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
246 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
247
248 /*!
249  * IPv6 control information.  If the socket is an IPv6 socket we want
250  * to collect the destination address and interface so the client can
251  * set them on outgoing packets.
252  */
253 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
254 #ifndef USE_CMSG
255 #define USE_CMSG        1
256 #endif
257 #endif
258
259 /*%
260  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
261  * a setsockopt() like interface to request timestamps, and if the OS
262  * doesn't do it for us, call gettimeofday() on every UDP receive?
263  */
264 #ifdef SO_TIMESTAMP
265 #ifndef USE_CMSG
266 #define USE_CMSG        1
267 #endif
268 #endif
269
270 /*%
271  * The size to raise the receive buffer to (from BIND 8).
272  */
273 #define RCVBUFSIZE (32*1024)
274
275 /*%
276  * The number of times a send operation is repeated if the result is EINTR.
277  */
278 #define NRETRIES 10
279
280 struct isc_socket {
281         /* Not locked. */
282         unsigned int            magic;
283         isc_socketmgr_t        *manager;
284         isc_mutex_t             lock;
285         isc_sockettype_t        type;
286
287         /* Locked by socket lock. */
288         ISC_LINK(isc_socket_t)  link;
289         unsigned int            references;
290         int                     fd;
291         int                     pf;
292
293         ISC_LIST(isc_socketevent_t)             send_list;
294         ISC_LIST(isc_socketevent_t)             recv_list;
295         ISC_LIST(isc_socket_newconnev_t)        accept_list;
296         isc_socket_connev_t                    *connect_ev;
297
298         /*
299          * Internal events.  Posted when a descriptor is readable or
300          * writable.  These are statically allocated and never freed.
301          * They will be set to non-purgable before use.
302          */
303         intev_t                 readable_ev;
304         intev_t                 writable_ev;
305
306         isc_sockaddr_t          address;  /* remote address */
307
308         unsigned int            pending_recv : 1,
309                                 pending_send : 1,
310                                 pending_accept : 1,
311                                 listener : 1, /* listener socket */
312                                 connected : 1,
313                                 connecting : 1, /* connect pending */
314                                 bound : 1; /* bound to local addr */
315
316 #ifdef ISC_NET_RECVOVERFLOW
317         unsigned char           overflow; /* used for MSG_TRUNC fake */
318 #endif
319
320         char                    *recvcmsgbuf;
321         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
322         char                    *sendcmsgbuf;
323         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
324 };
325
326 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
327 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
328
329 struct isc_socketmgr {
330         /* Not locked. */
331         unsigned int            magic;
332         isc_mem_t              *mctx;
333         isc_mutex_t             lock;
334         isc_mutex_t             *fdlock;
335 #ifdef USE_KQUEUE
336         int                     kqueue_fd;
337         int                     nevents;
338         struct kevent           *events;
339 #endif  /* USE_KQUEUE */
340 #ifdef USE_EPOLL
341         int                     epoll_fd;
342         int                     nevents;
343         struct epoll_event      *events;
344 #endif  /* USE_EPOLL */
345 #ifdef USE_DEVPOLL
346         int                     devpoll_fd;
347         int                     nevents;
348         struct pollfd           *events;
349 #endif  /* USE_DEVPOLL */
350 #ifdef USE_SELECT
351         int                     fd_bufsize;
352 #endif  /* USE_SELECT */
353         unsigned int            maxsocks;
354 #ifdef ISC_PLATFORM_USETHREADS
355         int                     pipe_fds[2];
356 #endif
357
358         /* Locked by fdlock. */
359         isc_socket_t           **fds;
360         int                     *fdstate;
361 #ifdef USE_DEVPOLL
362         pollinfo_t              *fdpollinfo;
363 #endif
364
365         /* Locked by manager lock. */
366         ISC_LIST(isc_socket_t)  socklist;
367 #ifdef USE_SELECT
368         fd_set                  *read_fds;
369         fd_set                  *read_fds_copy;
370         fd_set                  *write_fds;
371         fd_set                  *write_fds_copy;
372         int                     maxfd;
373 #endif  /* USE_SELECT */
374         int                     reserved;       /* unlocked */
375 #ifdef ISC_PLATFORM_USETHREADS
376         isc_thread_t            watcher;
377         isc_condition_t         shutdown_ok;
378 #else /* ISC_PLATFORM_USETHREADS */
379         unsigned int            refs;
380 #endif /* ISC_PLATFORM_USETHREADS */
381 };
382
383 #ifndef ISC_PLATFORM_USETHREADS
384 static isc_socketmgr_t *socketmgr = NULL;
385 #endif /* ISC_PLATFORM_USETHREADS */
386
387 #define CLOSED          0       /* this one must be zero */
388 #define MANAGED         1
389 #define CLOSE_PENDING   2
390
391 /*
392  * send() and recv() iovec counts
393  */
394 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
395 #ifdef ISC_NET_RECVOVERFLOW
396 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
397 #else
398 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
399 #endif
400
401 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
402 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
403 static void free_socket(isc_socket_t **);
404 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
405                                     isc_socket_t **);
406 static void destroy(isc_socket_t **);
407 static void internal_accept(isc_task_t *, isc_event_t *);
408 static void internal_connect(isc_task_t *, isc_event_t *);
409 static void internal_recv(isc_task_t *, isc_event_t *);
410 static void internal_send(isc_task_t *, isc_event_t *);
411 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
412 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
413                               struct msghdr *, struct iovec *, size_t *);
414 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
415                               struct msghdr *, struct iovec *, size_t *);
416 #ifdef ISC_PLATFORM_USETHREADS
417 static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
418 #endif
419
420 #define SELECT_POKE_SHUTDOWN            (-1)
421 #define SELECT_POKE_NOTHING             (-2)
422 #define SELECT_POKE_READ                (-3)
423 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
424 #define SELECT_POKE_WRITE               (-4)
425 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
426 #define SELECT_POKE_CLOSE               (-5)
427
428 #define SOCK_DEAD(s)                    ((s)->references == 0)
429
430 static void
431 manager_log(isc_socketmgr_t *sockmgr,
432             isc_logcategory_t *category, isc_logmodule_t *module, int level,
433             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
434 static void
435 manager_log(isc_socketmgr_t *sockmgr,
436             isc_logcategory_t *category, isc_logmodule_t *module, int level,
437             const char *fmt, ...)
438 {
439         char msgbuf[2048];
440         va_list ap;
441
442         if (! isc_log_wouldlog(isc_lctx, level))
443                 return;
444
445         va_start(ap, fmt);
446         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
447         va_end(ap);
448
449         isc_log_write(isc_lctx, category, module, level,
450                       "sockmgr %p: %s", sockmgr, msgbuf);
451 }
452
453 static void
454 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
455            isc_logcategory_t *category, isc_logmodule_t *module, int level,
456            isc_msgcat_t *msgcat, int msgset, int message,
457            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
458 static void
459 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
460            isc_logcategory_t *category, isc_logmodule_t *module, int level,
461            isc_msgcat_t *msgcat, int msgset, int message,
462            const char *fmt, ...)
463 {
464         char msgbuf[2048];
465         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
466         va_list ap;
467
468         if (! isc_log_wouldlog(isc_lctx, level))
469                 return;
470
471         va_start(ap, fmt);
472         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
473         va_end(ap);
474
475         if (address == NULL) {
476                 isc_log_iwrite(isc_lctx, category, module, level,
477                                msgcat, msgset, message,
478                                "socket %p: %s", sock, msgbuf);
479         } else {
480                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
481                 isc_log_iwrite(isc_lctx, category, module, level,
482                                msgcat, msgset, message,
483                                "socket %p %s: %s", sock, peerbuf, msgbuf);
484         }
485 }
486
487 #if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
488     defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
489 /*
490  * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
491  * setting IPV6_V6ONLY.
492  */
493 static void
494 FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
495 {
496         char strbuf[ISC_STRERRORSIZE];
497         int on = 1;
498
499         if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
500                 return;
501
502         if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
503                        (void *)&on, sizeof(on)) < 0) {
504         
505                 UNEXPECTED_ERROR(__FILE__, __LINE__,
506                                  "setsockopt(%d, IPV6_RECVPKTINFO) "
507                                  "%s: %s", sock->fd,
508                                  isc_msgcat_get(isc_msgcat,
509                                                 ISC_MSGSET_GENERAL,
510                                                 ISC_MSG_FAILED,
511                                                 "failed"),
512                                  strbuf);
513         }
514 }
515 #else
516 #define FIX_IPV6_RECVPKTINFO(sock) (void)0
517 #endif
518
519 static inline isc_result_t
520 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
521         isc_result_t result = ISC_R_SUCCESS;
522
523 #ifdef USE_KQUEUE
524         struct kevent evchange;
525
526         memset(&evchange, 0, sizeof(evchange));
527         if (msg == SELECT_POKE_READ)
528                 evchange.filter = EVFILT_READ;
529         else
530                 evchange.filter = EVFILT_WRITE;
531         evchange.flags = EV_ADD;
532         evchange.ident = fd;
533         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
534                 result = isc__errno2result(errno);
535
536         return (result);
537 #elif defined(USE_EPOLL)
538         struct epoll_event event;
539
540         if (msg == SELECT_POKE_READ)
541                 event.events = EPOLLIN;
542         else
543                 event.events = EPOLLOUT;
544         event.data.fd = fd;
545         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
546             errno != EEXIST) {
547                 result = isc__errno2result(errno);
548         }
549
550         return (result);
551 #elif defined(USE_DEVPOLL)
552         struct pollfd pfd;
553         int lockid = FDLOCK_ID(fd);
554
555         memset(&pfd, 0, sizeof(pfd));
556         if (msg == SELECT_POKE_READ)
557                 pfd.events = POLLIN;
558         else
559                 pfd.events = POLLOUT;
560         pfd.fd = fd;
561         pfd.revents = 0;
562         LOCK(&manager->fdlock[lockid]);
563         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
564                 result = isc__errno2result(errno);
565         else {
566                 if (msg == SELECT_POKE_READ)
567                         manager->fdpollinfo[fd].want_read = 1;
568                 else
569                         manager->fdpollinfo[fd].want_write = 1;
570         }
571         UNLOCK(&manager->fdlock[lockid]);
572
573         return (result);
574 #elif defined(USE_SELECT)
575         LOCK(&manager->lock);
576         if (msg == SELECT_POKE_READ)
577                 FD_SET(fd, manager->read_fds);
578         if (msg == SELECT_POKE_WRITE)
579                 FD_SET(fd, manager->write_fds);
580         UNLOCK(&manager->lock);
581
582         return (result);
583 #endif
584 }
585
586 static inline isc_result_t
587 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
588         isc_result_t result = ISC_R_SUCCESS;
589
590 #ifdef USE_KQUEUE
591         struct kevent evchange;
592
593         memset(&evchange, 0, sizeof(evchange));
594         if (msg == SELECT_POKE_READ)
595                 evchange.filter = EVFILT_READ;
596         else
597                 evchange.filter = EVFILT_WRITE;
598         evchange.flags = EV_DELETE;
599         evchange.ident = fd;
600         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
601                 result = isc__errno2result(errno);
602
603         return (result);
604 #elif defined(USE_EPOLL)
605         struct epoll_event event;
606
607         if (msg == SELECT_POKE_READ)
608                 event.events = EPOLLIN;
609         else
610                 event.events = EPOLLOUT;
611         event.data.fd = fd;
612         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
613             errno != ENOENT) {
614                 char strbuf[ISC_STRERRORSIZE];
615                 isc__strerror(errno, strbuf, sizeof(strbuf));
616                 UNEXPECTED_ERROR(__FILE__, __LINE__,
617                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
618                 result = ISC_R_UNEXPECTED;
619         }
620         return (result);
621 #elif defined(USE_DEVPOLL)
622         struct pollfd pfds[2];
623         size_t writelen = sizeof(pfds[0]);
624         int lockid = FDLOCK_ID(fd);
625
626         memset(pfds, 0, sizeof(pfds));
627         pfds[0].events = POLLREMOVE;
628         pfds[0].fd = fd;
629
630         /*
631          * Canceling read or write polling via /dev/poll is tricky.  Since it
632          * only provides a way of canceling per FD, we may need to re-poll the
633          * socket for the other operation.
634          */
635         LOCK(&manager->fdlock[lockid]);
636         if (msg == SELECT_POKE_READ &&
637             manager->fdpollinfo[fd].want_write == 1) {
638                 pfds[1].events = POLLOUT;
639                 pfds[1].fd = fd;
640                 writelen += sizeof(pfds[1]);
641         }
642         if (msg == SELECT_POKE_WRITE &&
643             manager->fdpollinfo[fd].want_read == 1) {
644                 pfds[1].events = POLLIN;
645                 pfds[1].fd = fd;
646                 writelen += sizeof(pfds[1]);
647         }
648
649         if (write(manager->devpoll_fd, pfds, writelen) == -1)
650                 result = isc__errno2result(errno);
651         else {
652                 if (msg == SELECT_POKE_READ)
653                         manager->fdpollinfo[fd].want_read = 0;
654                 else
655                         manager->fdpollinfo[fd].want_write = 0;
656         }
657         UNLOCK(&manager->fdlock[lockid]);
658
659         return (result);
660 #elif defined(USE_SELECT)
661         LOCK(&manager->lock);
662         if (msg == SELECT_POKE_READ)
663                 FD_CLR(fd, manager->read_fds);
664         else if (msg == SELECT_POKE_WRITE)
665                 FD_CLR(fd, manager->write_fds);
666         UNLOCK(&manager->lock);
667
668         return (result);
669 #endif
670 }
671
672 static void
673 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
674         isc_result_t result;
675         int lockid = FDLOCK_ID(fd);
676
677         /*
678          * This is a wakeup on a socket.  If the socket is not in the
679          * process of being closed, start watching it for either reads
680          * or writes.
681          */
682
683         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
684
685         if (msg == SELECT_POKE_CLOSE) {
686                 /* No one should be updating fdstate, so no need to lock it */
687                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
688                 manager->fdstate[fd] = CLOSED;
689                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
690                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
691                 (void)close(fd);
692                 return;
693         }
694
695         LOCK(&manager->fdlock[lockid]);
696         if (manager->fdstate[fd] == CLOSE_PENDING) {
697                 UNLOCK(&manager->fdlock[lockid]);
698                 /*
699                  * We accept (and ignore) any error from unwatch_fd() as we are
700                  * closing the socket, hoping it doesn't leave dangling state in
701                  * the kernel.
702                  * Note that unwatch_fd() must be called after releasing the
703                  * fdlock; otherwise it could cause deadlock due to a lock order
704                  * reversal.
705                  */
706                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
707                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
708                 return;
709         }
710         if (manager->fdstate[fd] != MANAGED) {
711                 UNLOCK(&manager->fdlock[lockid]);
712                 return;
713         }
714         UNLOCK(&manager->fdlock[lockid]);
715
716         /*
717          * Set requested bit.
718          */
719         result = watch_fd(manager, fd, msg);
720         if (result != ISC_R_SUCCESS) {
721                 /*
722                  * XXXJT: what should we do?  Ignoring the failure of watching
723                  * a socket will make the application dysfunctional, but there
724                  * seems to be no reasonable recovery process.
725                  */
726                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
727                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
728                               "failed to start watching FD (%d): %s",
729                               fd, isc_result_totext(result));
730         }
731 }
732
733 #ifdef ISC_PLATFORM_USETHREADS
734 /*
735  * Poke the select loop when there is something for us to do.
736  * The write is required (by POSIX) to complete.  That is, we
737  * will not get partial writes.
738  */
739 static void
740 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
741         int cc;
742         int buf[2];
743         char strbuf[ISC_STRERRORSIZE];
744
745         buf[0] = fd;
746         buf[1] = msg;
747
748         do {
749                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
750 #ifdef ENOSR
751                 /*
752                  * Treat ENOSR as EAGAIN but loop slowly as it is
753                  * unlikely to clear fast.
754                  */
755                 if (cc < 0 && errno == ENOSR) {
756                         sleep(1);
757                         errno = EAGAIN;
758                 }
759 #endif
760         } while (cc < 0 && SOFT_ERROR(errno));
761
762         if (cc < 0) {
763                 isc__strerror(errno, strbuf, sizeof(strbuf));
764                 FATAL_ERROR(__FILE__, __LINE__,
765                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
766                                            ISC_MSG_WRITEFAILED,
767                                            "write() failed "
768                                            "during watcher poke: %s"),
769                             strbuf);
770         }
771
772         INSIST(cc == sizeof(buf));
773 }
774
775 /*
776  * Read a message on the internal fd.
777  */
778 static void
779 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
780         int buf[2];
781         int cc;
782         char strbuf[ISC_STRERRORSIZE];
783
784         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
785         if (cc < 0) {
786                 *msg = SELECT_POKE_NOTHING;
787                 *fd = -1;       /* Silence compiler. */
788                 if (SOFT_ERROR(errno))
789                         return;
790
791                 isc__strerror(errno, strbuf, sizeof(strbuf));
792                 FATAL_ERROR(__FILE__, __LINE__,
793                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
794                                            ISC_MSG_READFAILED,
795                                            "read() failed "
796                                            "during watcher poke: %s"),
797                             strbuf);
798
799                 return;
800         }
801         INSIST(cc == sizeof(buf));
802
803         *fd = buf[0];
804         *msg = buf[1];
805 }
806 #else /* ISC_PLATFORM_USETHREADS */
807 /*
808  * Update the state of the socketmgr when something changes.
809  */
810 static void
811 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
812         if (msg == SELECT_POKE_SHUTDOWN)
813                 return;
814         else if (fd >= 0)
815                 wakeup_socket(manager, fd, msg);
816         return;
817 }
818 #endif /* ISC_PLATFORM_USETHREADS */
819
820 /*
821  * Make a fd non-blocking.
822  */
823 static isc_result_t
824 make_nonblock(int fd) {
825         int ret;
826         int flags;
827         char strbuf[ISC_STRERRORSIZE];
828 #ifdef USE_FIONBIO_IOCTL
829         int on = 1;
830
831         ret = ioctl(fd, FIONBIO, (char *)&on);
832 #else
833         flags = fcntl(fd, F_GETFL, 0);
834         flags |= PORT_NONBLOCK;
835         ret = fcntl(fd, F_SETFL, flags);
836 #endif
837
838         if (ret == -1) {
839                 isc__strerror(errno, strbuf, sizeof(strbuf));
840                 UNEXPECTED_ERROR(__FILE__, __LINE__,
841 #ifdef USE_FIONBIO_IOCTL
842                                  "ioctl(%d, FIONBIO, &on): %s", fd,
843 #else
844                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
845 #endif
846                                  strbuf);
847
848                 return (ISC_R_UNEXPECTED);
849         }
850
851         return (ISC_R_SUCCESS);
852 }
853
854 #ifdef USE_CMSG
855 /*
856  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
857  * In order to ensure as much portability as possible, we provide wrapper
858  * functions of these macros.
859  * Note that cmsg_space() could run slow on OSes that do not have
860  * CMSG_SPACE.
861  */
862 static inline ISC_SOCKADDR_LEN_T
863 cmsg_len(ISC_SOCKADDR_LEN_T len) {
864 #ifdef CMSG_LEN
865         return (CMSG_LEN(len));
866 #else
867         ISC_SOCKADDR_LEN_T hdrlen;
868
869         /*
870          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
871          * is correct.
872          */
873         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
874         return (hdrlen + len);
875 #endif
876 }
877
878 static inline ISC_SOCKADDR_LEN_T
879 cmsg_space(ISC_SOCKADDR_LEN_T len) {
880 #ifdef CMSG_SPACE
881         return (CMSG_SPACE(len));
882 #else
883         struct msghdr msg;
884         struct cmsghdr *cmsgp;
885         /*
886          * XXX: The buffer length is an ad-hoc value, but should be enough
887          * in a practical sense.
888          */
889         char dummybuf[sizeof(struct cmsghdr) + 1024];
890
891         memset(&msg, 0, sizeof(msg));
892         msg.msg_control = dummybuf;
893         msg.msg_controllen = sizeof(dummybuf);
894
895         cmsgp = (struct cmsghdr *)dummybuf;
896         cmsgp->cmsg_len = cmsg_len(len);
897
898         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
899         if (cmsgp != NULL)
900                 return ((char *)cmsgp - (char *)msg.msg_control);
901         else
902                 return (0);
903 #endif
904 }
905 #endif /* USE_CMSG */
906
907 /*
908  * Process control messages received on a socket.
909  */
910 static void
911 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
912 #ifdef USE_CMSG
913         struct cmsghdr *cmsgp;
914 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
915         struct in6_pktinfo *pktinfop;
916 #endif
917 #ifdef SO_TIMESTAMP
918         struct timeval *timevalp;
919 #endif
920 #endif
921
922         /*
923          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
924          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
925          * They are all here, outside of the CPP tests, because it is
926          * more consistent with the usual ISC coding style.
927          */
928         UNUSED(sock);
929         UNUSED(msg);
930         UNUSED(dev);
931
932 #ifdef ISC_NET_BSD44MSGHDR
933
934 #ifdef MSG_TRUNC
935         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
936                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
937 #endif
938
939 #ifdef MSG_CTRUNC
940         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
941                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
942 #endif
943
944 #ifndef USE_CMSG
945         return;
946 #else
947         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
948                 return;
949
950 #ifdef SO_TIMESTAMP
951         timevalp = NULL;
952 #endif
953 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
954         pktinfop = NULL;
955 #endif
956
957         cmsgp = CMSG_FIRSTHDR(msg);
958         while (cmsgp != NULL) {
959                 socket_log(sock, NULL, TRACE,
960                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
961                            "processing cmsg %p", cmsgp);
962
963 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
964                 if (cmsgp->cmsg_level == IPPROTO_IPV6
965                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
966
967                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
968                         memcpy(&dev->pktinfo, pktinfop,
969                                sizeof(struct in6_pktinfo));
970                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
971                         socket_log(sock, NULL, TRACE,
972                                    isc_msgcat, ISC_MSGSET_SOCKET,
973                                    ISC_MSG_IFRECEIVED,
974                                    "interface received on ifindex %u",
975                                    dev->pktinfo.ipi6_ifindex);
976                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
977                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
978                         goto next;
979                 }
980 #endif
981
982 #ifdef SO_TIMESTAMP
983                 if (cmsgp->cmsg_level == SOL_SOCKET
984                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
985                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
986                         dev->timestamp.seconds = timevalp->tv_sec;
987                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
988                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
989                         goto next;
990                 }
991 #endif
992
993         next:
994                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
995         }
996 #endif /* USE_CMSG */
997
998 #endif /* ISC_NET_BSD44MSGHDR */
999 }
1000
1001 /*
1002  * Construct an iov array and attach it to the msghdr passed in.  This is
1003  * the SEND constructor, which will use the used region of the buffer
1004  * (if using a buffer list) or will use the internal region (if a single
1005  * buffer I/O is requested).
1006  *
1007  * Nothing can be NULL, and the done event must list at least one buffer
1008  * on the buffer linked list for this function to be meaningful.
1009  *
1010  * If write_countp != NULL, *write_countp will hold the number of bytes
1011  * this transaction can send.
1012  */
1013 static void
1014 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
1015                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1016 {
1017         unsigned int iovcount;
1018         isc_buffer_t *buffer;
1019         isc_region_t used;
1020         size_t write_count;
1021         size_t skip_count;
1022
1023         memset(msg, 0, sizeof(*msg));
1024
1025         if (!sock->connected) {
1026                 msg->msg_name = (void *)&dev->address.type.sa;
1027                 msg->msg_namelen = dev->address.length;
1028         } else {
1029                 msg->msg_name = NULL;
1030                 msg->msg_namelen = 0;
1031         }
1032
1033         buffer = ISC_LIST_HEAD(dev->bufferlist);
1034         write_count = 0;
1035         iovcount = 0;
1036
1037         /*
1038          * Single buffer I/O?  Skip what we've done so far in this region.
1039          */
1040         if (buffer == NULL) {
1041                 write_count = dev->region.length - dev->n;
1042                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1043                 iov[0].iov_len = write_count;
1044                 iovcount = 1;
1045
1046                 goto config;
1047         }
1048
1049         /*
1050          * Multibuffer I/O.
1051          * Skip the data in the buffer list that we have already written.
1052          */
1053         skip_count = dev->n;
1054         while (buffer != NULL) {
1055                 REQUIRE(ISC_BUFFER_VALID(buffer));
1056                 if (skip_count < isc_buffer_usedlength(buffer))
1057                         break;
1058                 skip_count -= isc_buffer_usedlength(buffer);
1059                 buffer = ISC_LIST_NEXT(buffer, link);
1060         }
1061
1062         while (buffer != NULL) {
1063                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1064
1065                 isc_buffer_usedregion(buffer, &used);
1066
1067                 if (used.length > 0) {
1068                         iov[iovcount].iov_base = (void *)(used.base
1069                                                           + skip_count);
1070                         iov[iovcount].iov_len = used.length - skip_count;
1071                         write_count += (used.length - skip_count);
1072                         skip_count = 0;
1073                         iovcount++;
1074                 }
1075                 buffer = ISC_LIST_NEXT(buffer, link);
1076         }
1077
1078         INSIST(skip_count == 0U);
1079
1080  config:
1081         msg->msg_iov = iov;
1082         msg->msg_iovlen = iovcount;
1083
1084 #ifdef ISC_NET_BSD44MSGHDR
1085         msg->msg_control = NULL;
1086         msg->msg_controllen = 0;
1087         msg->msg_flags = 0;
1088 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1089         if ((sock->type == isc_sockettype_udp)
1090             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1091                 struct cmsghdr *cmsgp;
1092                 struct in6_pktinfo *pktinfop;
1093
1094                 socket_log(sock, NULL, TRACE,
1095                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1096                            "sendto pktinfo data, ifindex %u",
1097                            dev->pktinfo.ipi6_ifindex);
1098
1099                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1100                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1101                 msg->msg_control = (void *)sock->sendcmsgbuf;
1102
1103                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1104                 cmsgp->cmsg_level = IPPROTO_IPV6;
1105                 cmsgp->cmsg_type = IPV6_PKTINFO;
1106                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1107                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1108                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1109         }
1110 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1111 #else /* ISC_NET_BSD44MSGHDR */
1112         msg->msg_accrights = NULL;
1113         msg->msg_accrightslen = 0;
1114 #endif /* ISC_NET_BSD44MSGHDR */
1115
1116         if (write_countp != NULL)
1117                 *write_countp = write_count;
1118 }
1119
1120 /*
1121  * Construct an iov array and attach it to the msghdr passed in.  This is
1122  * the RECV constructor, which will use the avialable region of the buffer
1123  * (if using a buffer list) or will use the internal region (if a single
1124  * buffer I/O is requested).
1125  *
1126  * Nothing can be NULL, and the done event must list at least one buffer
1127  * on the buffer linked list for this function to be meaningful.
1128  *
1129  * If read_countp != NULL, *read_countp will hold the number of bytes
1130  * this transaction can receive.
1131  */
1132 static void
1133 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1134                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1135 {
1136         unsigned int iovcount;
1137         isc_buffer_t *buffer;
1138         isc_region_t available;
1139         size_t read_count;
1140
1141         memset(msg, 0, sizeof(struct msghdr));
1142
1143         if (sock->type == isc_sockettype_udp) {
1144                 memset(&dev->address, 0, sizeof(dev->address));
1145 #ifdef BROKEN_RECVMSG
1146                 if (sock->pf == AF_INET) {
1147                         msg->msg_name = (void *)&dev->address.type.sin;
1148                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1149                 } else if (sock->pf == AF_INET6) {
1150                         msg->msg_name = (void *)&dev->address.type.sin6;
1151                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1152 #ifdef ISC_PLATFORM_HAVESYSUNH
1153                 } else if (sock->pf == AF_UNIX) {
1154                         msg->msg_name = (void *)&dev->address.type.sunix;
1155                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1156 #endif
1157                 } else {
1158                         msg->msg_name = (void *)&dev->address.type.sa;
1159                         msg->msg_namelen = sizeof(dev->address.type);
1160                 }
1161 #else
1162                 msg->msg_name = (void *)&dev->address.type.sa;
1163                 msg->msg_namelen = sizeof(dev->address.type);
1164 #endif
1165 #ifdef ISC_NET_RECVOVERFLOW
1166                 /* If needed, steal one iovec for overflow detection. */
1167                 maxiov--;
1168 #endif
1169         } else { /* TCP */
1170                 msg->msg_name = NULL;
1171                 msg->msg_namelen = 0;
1172                 dev->address = sock->address;
1173         }
1174
1175         buffer = ISC_LIST_HEAD(dev->bufferlist);
1176         read_count = 0;
1177
1178         /*
1179          * Single buffer I/O?  Skip what we've done so far in this region.
1180          */
1181         if (buffer == NULL) {
1182                 read_count = dev->region.length - dev->n;
1183                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1184                 iov[0].iov_len = read_count;
1185                 iovcount = 1;
1186
1187                 goto config;
1188         }
1189
1190         /*
1191          * Multibuffer I/O.
1192          * Skip empty buffers.
1193          */
1194         while (buffer != NULL) {
1195                 REQUIRE(ISC_BUFFER_VALID(buffer));
1196                 if (isc_buffer_availablelength(buffer) != 0)
1197                         break;
1198                 buffer = ISC_LIST_NEXT(buffer, link);
1199         }
1200
1201         iovcount = 0;
1202         while (buffer != NULL) {
1203                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1204
1205                 isc_buffer_availableregion(buffer, &available);
1206
1207                 if (available.length > 0) {
1208                         iov[iovcount].iov_base = (void *)(available.base);
1209                         iov[iovcount].iov_len = available.length;
1210                         read_count += available.length;
1211                         iovcount++;
1212                 }
1213                 buffer = ISC_LIST_NEXT(buffer, link);
1214         }
1215
1216  config:
1217
1218         /*
1219          * If needed, set up to receive that one extra byte.  Note that
1220          * we know there is at least one iov left, since we stole it
1221          * at the top of this function.
1222          */
1223 #ifdef ISC_NET_RECVOVERFLOW
1224         if (sock->type == isc_sockettype_udp) {
1225                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1226                 iov[iovcount].iov_len = 1;
1227                 iovcount++;
1228         }
1229 #endif
1230
1231         msg->msg_iov = iov;
1232         msg->msg_iovlen = iovcount;
1233
1234 #ifdef ISC_NET_BSD44MSGHDR
1235         msg->msg_control = NULL;
1236         msg->msg_controllen = 0;
1237         msg->msg_flags = 0;
1238 #if defined(USE_CMSG)
1239         if (sock->type == isc_sockettype_udp) {
1240                 msg->msg_control = sock->recvcmsgbuf;
1241                 msg->msg_controllen = sock->recvcmsgbuflen;
1242         }
1243 #endif /* USE_CMSG */
1244 #else /* ISC_NET_BSD44MSGHDR */
1245         msg->msg_accrights = NULL;
1246         msg->msg_accrightslen = 0;
1247 #endif /* ISC_NET_BSD44MSGHDR */
1248
1249         if (read_countp != NULL)
1250                 *read_countp = read_count;
1251 }
1252
1253 static void
1254 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1255                 isc_socketevent_t *dev)
1256 {
1257         if (sock->type == isc_sockettype_udp) {
1258                 if (address != NULL)
1259                         dev->address = *address;
1260                 else
1261                         dev->address = sock->address;
1262         } else if (sock->type == isc_sockettype_tcp) {
1263                 INSIST(address == NULL);
1264                 dev->address = sock->address;
1265         }
1266 }
1267
1268 static void
1269 destroy_socketevent(isc_event_t *event) {
1270         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1271
1272         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1273
1274         (ev->destroy)(event);
1275 }
1276
1277 static isc_socketevent_t *
1278 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1279                      isc_taskaction_t action, const void *arg)
1280 {
1281         isc_socketevent_t *ev;
1282
1283         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1284                                                      sock, eventtype,
1285                                                      action, arg,
1286                                                      sizeof(*ev));
1287
1288         if (ev == NULL)
1289                 return (NULL);
1290
1291         ev->result = ISC_R_UNEXPECTED;
1292         ISC_LINK_INIT(ev, ev_link);
1293         ISC_LIST_INIT(ev->bufferlist);
1294         ev->region.base = NULL;
1295         ev->n = 0;
1296         ev->offset = 0;
1297         ev->attributes = 0;
1298         ev->destroy = ev->ev_destroy;
1299         ev->ev_destroy = destroy_socketevent;
1300
1301         return (ev);
1302 }
1303
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' (name, iovec array and,
 * with ISC_NET_BSD44MSGHDR, the control buffer) to standard output.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	/* %p requires a void pointer, hence the casts. */
	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
#endif
1324
1325 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
1326 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
1327 #define DOIO_HARD               2       /* i/o error, event sent */
1328 #define DOIO_EOF                3       /* EOF, no event sent */
1329
1330 static int
1331 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1332         int cc;
1333         struct iovec iov[MAXSCATTERGATHER_RECV];
1334         size_t read_count;
1335         size_t actual_count;
1336         struct msghdr msghdr;
1337         isc_buffer_t *buffer;
1338         int recv_errno;
1339         char strbuf[ISC_STRERRORSIZE];
1340
1341         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1342
1343 #if defined(ISC_SOCKET_DEBUG)
1344         dump_msg(&msghdr);
1345 #endif
1346
1347         cc = recvmsg(sock->fd, &msghdr, 0);
1348         recv_errno = errno;
1349
1350 #if defined(ISC_SOCKET_DEBUG)
1351         dump_msg(&msghdr);
1352 #endif
1353
1354         if (cc < 0) {
1355                 if (SOFT_ERROR(recv_errno))
1356                         return (DOIO_SOFT);
1357
1358                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1359                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1360                         socket_log(sock, NULL, IOEVENT,
1361                                    isc_msgcat, ISC_MSGSET_SOCKET,
1362                                    ISC_MSG_DOIORECV,
1363                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1364                                    sock->fd, cc, recv_errno, strbuf);
1365                 }
1366
1367 #define SOFT_OR_HARD(_system, _isc) \
1368         if (recv_errno == _system) { \
1369                 if (sock->connected) { \
1370                         dev->result = _isc; \
1371                         return (DOIO_HARD); \
1372                 } \
1373                 return (DOIO_SOFT); \
1374         }
1375 #define ALWAYS_HARD(_system, _isc) \
1376         if (recv_errno == _system) { \
1377                 dev->result = _isc; \
1378                 return (DOIO_HARD); \
1379         }
1380
1381                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1382                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1383                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1384                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1385                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1386                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1387                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1388                 /*
1389                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1390                  * errors.
1391                  */
1392 #ifdef EPROTO
1393                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1394 #endif
1395                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1396
1397 #undef SOFT_OR_HARD
1398 #undef ALWAYS_HARD
1399
1400                 dev->result = isc__errno2result(recv_errno);
1401                 return (DOIO_HARD);
1402         }
1403
1404         /*
1405          * On TCP, zero length reads indicate EOF, while on
1406          * UDP, zero length reads are perfectly valid, although
1407          * strange.
1408          */
1409         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1410                 return (DOIO_EOF);
1411
1412         if (sock->type == isc_sockettype_udp) {
1413                 dev->address.length = msghdr.msg_namelen;
1414                 if (isc_sockaddr_getport(&dev->address) == 0) {
1415                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416                                 socket_log(sock, &dev->address, IOEVENT,
1417                                            isc_msgcat, ISC_MSGSET_SOCKET,
1418                                            ISC_MSG_ZEROPORT,
1419                                            "dropping source port zero packet");
1420                         }
1421                         return (DOIO_SOFT);
1422                 }
1423         }
1424
1425         socket_log(sock, &dev->address, IOEVENT,
1426                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1427                    "packet received correctly");
1428
1429         /*
1430          * Overflow bit detection.  If we received MORE bytes than we should,
1431          * this indicates an overflow situation.  Set the flag in the
1432          * dev entry and adjust how much we read by one.
1433          */
1434 #ifdef ISC_NET_RECVOVERFLOW
1435         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1436                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1437                 cc--;
1438         }
1439 #endif
1440
1441         /*
1442          * If there are control messages attached, run through them and pull
1443          * out the interesting bits.
1444          */
1445         if (sock->type == isc_sockettype_udp)
1446                 process_cmsg(sock, &msghdr, dev);
1447
1448         /*
1449          * update the buffers (if any) and the i/o count
1450          */
1451         dev->n += cc;
1452         actual_count = cc;
1453         buffer = ISC_LIST_HEAD(dev->bufferlist);
1454         while (buffer != NULL && actual_count > 0U) {
1455                 REQUIRE(ISC_BUFFER_VALID(buffer));
1456                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1457                         actual_count -= isc_buffer_availablelength(buffer);
1458                         isc_buffer_add(buffer,
1459                                        isc_buffer_availablelength(buffer));
1460                 } else {
1461                         isc_buffer_add(buffer, actual_count);
1462                         actual_count = 0;
1463                         break;
1464                 }
1465                 buffer = ISC_LIST_NEXT(buffer, link);
1466                 if (buffer == NULL) {
1467                         INSIST(actual_count == 0U);
1468                 }
1469         }
1470
1471         /*
1472          * If we read less than we expected, update counters,
1473          * and let the upper layer poke the descriptor.
1474          */
1475         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1476                 return (DOIO_SOFT);
1477
1478         /*
1479          * Full reads are posted, or partials if partials are ok.
1480          */
1481         dev->result = ISC_R_SUCCESS;
1482         return (DOIO_SUCCESS);
1483 }
1484
1485 /*
1486  * Returns:
1487  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1488  *                      ISC_R_SUCCESS.
1489  *
1490  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1491  *                      dev->result contains the appropriate error.
1492  *
1493  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1494  *                      event was sent.  The operation should be retried.
1495  *
1496  *      No other return values are possible.
1497  */
1498 static int
1499 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1500         int cc;
1501         struct iovec iov[MAXSCATTERGATHER_SEND];
1502         size_t write_count;
1503         struct msghdr msghdr;
1504         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1505         int attempts = 0;
1506         int send_errno;
1507         char strbuf[ISC_STRERRORSIZE];
1508
1509         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1510
1511  resend:
1512         cc = sendmsg(sock->fd, &msghdr, 0);
1513         send_errno = errno;
1514
1515         /*
1516          * Check for error or block condition.
1517          */
1518         if (cc < 0) {
1519                 if (send_errno == EINTR && ++attempts < NRETRIES)
1520                         goto resend;
1521
1522                 if (SOFT_ERROR(send_errno))
1523                         return (DOIO_SOFT);
1524
1525 #define SOFT_OR_HARD(_system, _isc) \
1526         if (send_errno == _system) { \
1527                 if (sock->connected) { \
1528                         dev->result = _isc; \
1529                         return (DOIO_HARD); \
1530                 } \
1531                 return (DOIO_SOFT); \
1532         }
1533 #define ALWAYS_HARD(_system, _isc) \
1534         if (send_errno == _system) { \
1535                 dev->result = _isc; \
1536                 return (DOIO_HARD); \
1537         }
1538
1539                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1540                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1541                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1542                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1543                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1544 #ifdef EHOSTDOWN
1545                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1546 #endif
1547                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1548                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1549                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1550                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1551                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1552
1553 #undef SOFT_OR_HARD
1554 #undef ALWAYS_HARD
1555
1556                 /*
1557                  * The other error types depend on whether or not the
1558                  * socket is UDP or TCP.  If it is UDP, some errors
1559                  * that we expect to be fatal under TCP are merely
1560                  * annoying, and are really soft errors.
1561                  *
1562                  * However, these soft errors are still returned as
1563                  * a status.
1564                  */
1565                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1566                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1567                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1568                                  addrbuf, strbuf);
1569                 dev->result = isc__errno2result(send_errno);
1570                 return (DOIO_HARD);
1571         }
1572
1573         if (cc == 0)
1574                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1575                                  "internal_send: send() %s 0",
1576                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1577                                                 ISC_MSG_RETURNED, "returned"));
1578
1579         /*
1580          * If we write less than we expected, update counters, poke.
1581          */
1582         dev->n += cc;
1583         if ((size_t)cc != write_count)
1584                 return (DOIO_SOFT);
1585
1586         /*
1587          * Exactly what we wanted to write.  We're done with this
1588          * entry.  Post its completion event.
1589          */
1590         dev->result = ISC_R_SUCCESS;
1591         return (DOIO_SUCCESS);
1592 }
1593
1594 /*
1595  * Kill.
1596  *
1597  * Caller must ensure that the socket is not locked and no external
1598  * references exist.
1599  */
1600 static void
1601 closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) {
1602         int lockid = FDLOCK_ID(fd);
1603
1604         UNUSED(type);
1605
1606         /*
1607          * No one has this socket open, so the watcher doesn't have to be
1608          * poked, and the socket doesn't have to be locked.
1609          */
1610         LOCK(&manager->fdlock[lockid]);
1611         manager->fds[fd] = NULL;
1612         manager->fdstate[fd] = CLOSE_PENDING;
1613         UNLOCK(&manager->fdlock[lockid]);
1614         select_poke(manager, fd, SELECT_POKE_CLOSE);
1615
1616         /*
1617          * update manager->maxfd here (XXX: this should be implemented more
1618          * efficiently)
1619          */
1620 #ifdef USE_SELECT
1621         LOCK(&manager->lock);
1622         if (manager->maxfd == fd) {
1623                 int i;
1624
1625                 manager->maxfd = 0;
1626                 for (i = fd - 1; i >= 0; i--) {
1627                         lockid = FDLOCK_ID(i);
1628
1629                         LOCK(&manager->fdlock[lockid]);
1630                         if (manager->fdstate[i] == MANAGED) {
1631                                 manager->maxfd = i;
1632                                 UNLOCK(&manager->fdlock[lockid]);
1633                                 break;
1634                         }
1635                         UNLOCK(&manager->fdlock[lockid]);
1636                 }
1637 #ifdef ISC_PLATFORM_USETHREADS
1638                 if (manager->maxfd < manager->pipe_fds[0])
1639                         manager->maxfd = manager->pipe_fds[0];
1640 #endif
1641         }
1642         UNLOCK(&manager->lock);
1643 #endif  /* USE_SELECT */
1644 }
1645
1646 static void
1647 destroy(isc_socket_t **sockp) {
1648         int fd;
1649         isc_socket_t *sock = *sockp;
1650         isc_socketmgr_t *manager = sock->manager;
1651
1652         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1653                    ISC_MSG_DESTROYING, "destroying");
1654
1655         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1656         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1657         INSIST(ISC_LIST_EMPTY(sock->send_list));
1658         INSIST(sock->connect_ev == NULL);
1659         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1660
1661         if (sock->fd >= 0) {
1662                 fd = sock->fd;
1663                 sock->fd = -1;
1664                 closesocket(manager, sock->type, fd);
1665         }
1666
1667         LOCK(&manager->lock);
1668
1669         ISC_LIST_UNLINK(manager->socklist, sock, link);
1670
1671 #ifdef ISC_PLATFORM_USETHREADS
1672         if (ISC_LIST_EMPTY(manager->socklist))
1673                 SIGNAL(&manager->shutdown_ok);
1674 #endif /* ISC_PLATFORM_USETHREADS */
1675
1676         UNLOCK(&manager->lock);
1677
1678         free_socket(sockp);
1679 }
1680
1681 static isc_result_t
1682 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1683                 isc_socket_t **socketp)
1684 {
1685         isc_socket_t *sock;
1686         isc_result_t result;
1687         ISC_SOCKADDR_LEN_T cmsgbuflen;
1688
1689         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1690
1691         if (sock == NULL)
1692                 return (ISC_R_NOMEMORY);
1693
1694         result = ISC_R_UNEXPECTED;
1695
1696         sock->magic = 0;
1697         sock->references = 0;
1698
1699         sock->manager = manager;
1700         sock->type = type;
1701         sock->fd = -1;
1702
1703         ISC_LINK_INIT(sock, link);
1704
1705         sock->recvcmsgbuf = NULL;
1706         sock->sendcmsgbuf = NULL;
1707
1708         /*
1709          * set up cmsg buffers
1710          */
1711         cmsgbuflen = 0;
1712 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1713         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1714 #endif
1715 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1716         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1717 #endif
1718         sock->recvcmsgbuflen = cmsgbuflen;
1719         if (sock->recvcmsgbuflen != 0U) {
1720                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1721                 if (sock->recvcmsgbuf == NULL)
1722                         goto error;
1723         }
1724
1725         cmsgbuflen = 0;
1726 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1727         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1728 #endif
1729         sock->sendcmsgbuflen = cmsgbuflen;
1730         if (sock->sendcmsgbuflen != 0U) {
1731                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1732                 if (sock->sendcmsgbuf == NULL)
1733                         goto error;
1734         }
1735
1736         /*
1737          * set up list of readers and writers to be initially empty
1738          */
1739         ISC_LIST_INIT(sock->recv_list);
1740         ISC_LIST_INIT(sock->send_list);
1741         ISC_LIST_INIT(sock->accept_list);
1742         sock->connect_ev = NULL;
1743         sock->pending_recv = 0;
1744         sock->pending_send = 0;
1745         sock->pending_accept = 0;
1746         sock->listener = 0;
1747         sock->connected = 0;
1748         sock->connecting = 0;
1749         sock->bound = 0;
1750
1751         /*
1752          * initialize the lock
1753          */
1754         result = isc_mutex_init(&sock->lock);
1755         if (result != ISC_R_SUCCESS) {
1756                 sock->magic = 0;
1757                 goto error;
1758         }
1759
1760         /*
1761          * Initialize readable and writable events
1762          */
1763         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1764                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1765                        NULL, sock, sock, NULL, NULL);
1766         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1767                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1768                        NULL, sock, sock, NULL, NULL);
1769
1770         sock->magic = SOCKET_MAGIC;
1771         *socketp = sock;
1772
1773         return (ISC_R_SUCCESS);
1774
1775  error:
1776         if (sock->recvcmsgbuf != NULL)
1777                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1778                             sock->recvcmsgbuflen);
1779         if (sock->sendcmsgbuf != NULL)
1780                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1781                             sock->sendcmsgbuflen);
1782         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1783
1784         return (result);
1785 }
1786
1787 /*
1788  * This event requires that the various lists be empty, that the reference
1789  * count be 1, and that the magic number is valid.  The other socket bits,
1790  * like the lock, must be initialized as well.  The fd associated must be
1791  * marked as closed, by setting it to -1 on close, or this routine will
1792  * also close the socket.
1793  */
1794 static void
1795 free_socket(isc_socket_t **socketp) {
1796         isc_socket_t *sock = *socketp;
1797
1798         INSIST(sock->references == 0);
1799         INSIST(VALID_SOCKET(sock));
1800         INSIST(!sock->connecting);
1801         INSIST(!sock->pending_recv);
1802         INSIST(!sock->pending_send);
1803         INSIST(!sock->pending_accept);
1804         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1805         INSIST(ISC_LIST_EMPTY(sock->send_list));
1806         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1807         INSIST(!ISC_LINK_LINKED(sock, link));
1808
1809         if (sock->recvcmsgbuf != NULL)
1810                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1811                             sock->recvcmsgbuflen);
1812         if (sock->sendcmsgbuf != NULL)
1813                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1814                             sock->sendcmsgbuflen);
1815
1816         sock->magic = 0;
1817
1818         DESTROYLOCK(&sock->lock);
1819
1820         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1821
1822         *socketp = NULL;
1823 }
1824
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary.  We have to work out which kernel
 * version we are running on at run time so that we don't cause the kernel
 * to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't simply never set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Clear 'bsdcompat' when the running Linux kernel reports a release of
 * 2.4 or later.  On other platforms this is a no-op.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname uts;
	char *cursor;
	long int kmajor;
	long int kminor;

	uname(&uts);	/* Can only fail if the buffer is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname(). */
	kmajor = strtol(uts.release, &cursor, 10);
	if (*cursor != '.')
		return;

	kminor = strtol(cursor + 1, &cursor, 10);
	if (kmajor > 2 || (kmajor == 2 && kminor >= 4))
		bsdcompat = ISC_FALSE;
#endif /* __linux __ */
}
#endif
1862
1863 static isc_result_t
1864 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
1865         char strbuf[ISC_STRERRORSIZE];
1866         const char *err = "socket";
1867         int tries = 0;
1868 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1869         int on = 1;
1870 #endif
1871 #if defined(SO_RCVBUF)
1872         ISC_SOCKADDR_LEN_T optlen;
1873         int size;
1874 #endif
1875
1876  again:
1877         switch (sock->type) {
1878         case isc_sockettype_udp:
1879                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1880                 break;
1881         case isc_sockettype_tcp:
1882                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1883                 break;
1884         case isc_sockettype_unix:
1885                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
1886                 break;
1887         }
1888         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
1889                 goto again;
1890
1891 #ifdef F_DUPFD
1892         /*
1893          * Leave a space for stdio and TCP to work in.
1894          */
1895         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
1896             sock->fd >= 0 && sock->fd < manager->reserved) {
1897                 int new, tmp;
1898                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
1899                 tmp = errno;
1900                 (void)close(sock->fd);
1901                 errno = tmp;
1902                 sock->fd = new;
1903                 err = "isc_socket_create: fcntl/reserved";
1904         } else if (sock->fd >= 0 && sock->fd < 20) {
1905                 int new, tmp;
1906                 new = fcntl(sock->fd, F_DUPFD, 20);
1907                 tmp = errno;
1908                 (void)close(sock->fd);
1909                 errno = tmp;
1910                 sock->fd = new;
1911                 err = "isc_socket_create: fcntl";
1912         }
1913 #endif
1914
1915         if (sock->fd >= (int)manager->maxsocks) {
1916                 (void)close(sock->fd);
1917                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1918                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1919                                isc_msgcat, ISC_MSGSET_SOCKET,
1920                                ISC_MSG_TOOMANYFDS,
1921                                "socket: file descriptor exceeds limit (%d/%u)",
1922                                sock->fd, manager->maxsocks);
1923                 return (ISC_R_NORESOURCES);
1924         }
1925
1926         if (sock->fd < 0) {
1927                 switch (errno) {
1928                 case EMFILE:
1929                 case ENFILE:
1930                 case ENOBUFS:
1931                         return (ISC_R_NORESOURCES);
1932
1933                 case EPROTONOSUPPORT:
1934                 case EPFNOSUPPORT:
1935                 case EAFNOSUPPORT:
1936                 /*
1937                  * Linux 2.2 (and maybe others) return EINVAL instead of
1938                  * EAFNOSUPPORT.
1939                  */
1940                 case EINVAL:
1941                         return (ISC_R_FAMILYNOSUPPORT);
1942
1943                 default:
1944                         isc__strerror(errno, strbuf, sizeof(strbuf));
1945                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1946                                          "%s() %s: %s", err,
1947                                          isc_msgcat_get(isc_msgcat,
1948                                                         ISC_MSGSET_GENERAL,
1949                                                         ISC_MSG_FAILED,
1950                                                         "failed"),
1951                                          strbuf);
1952                         return (ISC_R_UNEXPECTED);
1953                 }
1954         }
1955
1956         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1957                 (void)close(sock->fd);
1958                 return (ISC_R_UNEXPECTED);
1959         }
1960
1961 #ifdef SO_BSDCOMPAT
1962         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
1963                                   clear_bsdcompat) == ISC_R_SUCCESS);
1964         if (sock->type != isc_sockettype_unix && bsdcompat &&
1965             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1966                        (void *)&on, sizeof(on)) < 0) {
1967                 isc__strerror(errno, strbuf, sizeof(strbuf));
1968                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1969                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1970                                  sock->fd,
1971                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1972                                                 ISC_MSG_FAILED, "failed"),
1973                                  strbuf);
1974                 /* Press on... */
1975         }
1976 #endif
1977
1978 #ifdef SO_NOSIGPIPE
1979         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
1980                        (void *)&on, sizeof(on)) < 0) {
1981                 isc__strerror(errno, strbuf, sizeof(strbuf));
1982                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1983                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
1984                                  sock->fd,
1985                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1986                                                 ISC_MSG_FAILED, "failed"),
1987                                  strbuf);
1988                 /* Press on... */
1989         }
1990 #endif
1991
1992 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1993         if (sock->type == isc_sockettype_udp) {
1994
1995 #if defined(USE_CMSG)
1996 #if defined(SO_TIMESTAMP)
1997                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1998                                (void *)&on, sizeof(on)) < 0
1999                     && errno != ENOPROTOOPT) {
2000                         isc__strerror(errno, strbuf, sizeof(strbuf));
2001                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2002                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2003                                          sock->fd,
2004                                          isc_msgcat_get(isc_msgcat,
2005                                                         ISC_MSGSET_GENERAL,
2006                                                         ISC_MSG_FAILED,
2007                                                         "failed"),
2008                                          strbuf);
2009                         /* Press on... */
2010                 }
2011 #endif /* SO_TIMESTAMP */
2012
2013 #if defined(ISC_PLATFORM_HAVEIPV6)
2014                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2015                         /*
2016                          * Warn explicitly because this anomaly can be hidden
2017                          * in usual operation (and unexpectedly appear later).
2018                          */
2019                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2020                                          "No buffer available to receive "
2021                                          "IPv6 destination");
2022                 }
2023 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2024 #ifdef IPV6_RECVPKTINFO
2025                 /* RFC 3542 */
2026                 if ((sock->pf == AF_INET6)
2027                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2028                                    (void *)&on, sizeof(on)) < 0)) {
2029                         isc__strerror(errno, strbuf, sizeof(strbuf));
2030                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2031                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2032                                          "%s: %s", sock->fd,
2033                                          isc_msgcat_get(isc_msgcat,
2034                                                         ISC_MSGSET_GENERAL,
2035                                                         ISC_MSG_FAILED,
2036                                                         "failed"),
2037                                          strbuf);
2038                 }
2039 #else
2040                 /* RFC 2292 */
2041                 if ((sock->pf == AF_INET6)
2042                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2043                                    (void *)&on, sizeof(on)) < 0)) {
2044                         isc__strerror(errno, strbuf, sizeof(strbuf));
2045                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2046                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2047                                          sock->fd,
2048                                          isc_msgcat_get(isc_msgcat,
2049                                                         ISC_MSGSET_GENERAL,
2050                                                         ISC_MSG_FAILED,
2051                                                         "failed"),
2052                                          strbuf);
2053                 }
2054 #endif /* IPV6_RECVPKTINFO */
2055 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2056 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2057                 /* use minimum MTU */
2058                 if (sock->pf == AF_INET6) {
2059                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2060                                          IPV6_USE_MIN_MTU,
2061                                          (void *)&on, sizeof(on));
2062                 }
2063 #endif
2064 #endif /* ISC_PLATFORM_HAVEIPV6 */
2065 #endif /* defined(USE_CMSG) */
2066
2067 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2068                 /*
2069                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2070                  */
2071                 if (sock->pf == AF_INET) {
2072                         int action = IP_PMTUDISC_DONT;
2073                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2074                                          &action, sizeof(action));
2075                 }
2076 #endif
2077 #if defined(IP_DONTFRAG)
2078                 /*
2079                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2080                  */
2081                 if (sock->pf == AF_INET) {
2082                         int off = 0;
2083                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2084                                          &off, sizeof(off));
2085                 }
2086 #endif
2087
2088 #if defined(SO_RCVBUF)
2089                 optlen = sizeof(size);
2090                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2091                                (void *)&size, &optlen) >= 0 &&
2092                      size < RCVBUFSIZE) {
2093                         size = RCVBUFSIZE;
2094                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2095                                        (void *)&size, sizeof(size)) == -1) {
2096                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2097                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2098                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2099                                         sock->fd, size,
2100                                         isc_msgcat_get(isc_msgcat,
2101                                                        ISC_MSGSET_GENERAL,
2102                                                        ISC_MSG_FAILED,
2103                                                        "failed"),
2104                                         strbuf);
2105                         }
2106                 }
2107 #endif
2108         }
2109 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2110
2111         return (ISC_R_SUCCESS);
2112 }
2113
2114 /*%
2115  * Create a new 'type' socket managed by 'manager'.  Events
2116  * will be posted to 'task' and when dispatched 'action' will be
2117  * called with 'arg' as the arg value.  The new socket is returned
2118  * in 'socketp'.
2119  */
2120 isc_result_t
2121 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2122                   isc_socket_t **socketp)
2123 {
2124         isc_socket_t *sock = NULL;
2125         isc_result_t result;
2126         int lockid;
2127
2128         REQUIRE(VALID_MANAGER(manager));
2129         REQUIRE(socketp != NULL && *socketp == NULL);
2130
2131         result = allocate_socket(manager, type, &sock);
2132         if (result != ISC_R_SUCCESS)
2133                 return (result);
2134
2135         sock->pf = pf;
2136         result = opensocket(manager, sock);
2137         if (result != ISC_R_SUCCESS) {
2138                 free_socket(&sock);
2139                 return (result);
2140         }
2141
2142         sock->references = 1;
2143         *socketp = sock;
2144
2145         /*
2146          * Note we don't have to lock the socket like we normally would because
2147          * there are no external references to it yet.
2148          */
2149
2150         lockid = FDLOCK_ID(sock->fd);
2151         LOCK(&manager->fdlock[lockid]);
2152         manager->fds[sock->fd] = sock;
2153         manager->fdstate[sock->fd] = MANAGED;
2154 #ifdef USE_DEVPOLL
2155         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2156                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2157 #endif
2158         UNLOCK(&manager->fdlock[lockid]);
2159
2160         LOCK(&manager->lock);
2161         ISC_LIST_APPEND(manager->socklist, sock, link);
2162 #ifdef USE_SELECT
2163         if (manager->maxfd < sock->fd)
2164                 manager->maxfd = sock->fd;
2165 #endif
2166         UNLOCK(&manager->lock);
2167
2168         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2169                    ISC_MSG_CREATED, "created");
2170
2171         return (ISC_R_SUCCESS);
2172 }
2173
2174 isc_result_t
2175 isc_socket_open(isc_socket_t *sock) {
2176         isc_result_t result;
2177
2178         REQUIRE(VALID_SOCKET(sock));
2179
2180         LOCK(&sock->lock);
2181         REQUIRE(sock->references == 1);
2182         UNLOCK(&sock->lock);
2183         /*
2184          * We don't need to retain the lock hereafter, since no one else has
2185          * this socket.
2186          */
2187         REQUIRE(sock->fd == -1);
2188
2189         result = opensocket(sock->manager, sock);
2190         if (result != ISC_R_SUCCESS)
2191                 sock->fd = -1;
2192
2193         if (result == ISC_R_SUCCESS) {
2194                 int lockid = FDLOCK_ID(sock->fd);
2195
2196                 LOCK(&sock->manager->fdlock[lockid]);
2197                 sock->manager->fds[sock->fd] = sock;
2198                 sock->manager->fdstate[sock->fd] = MANAGED;
2199 #ifdef USE_DEVPOLL
2200                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2201                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2202 #endif
2203                 UNLOCK(&sock->manager->fdlock[lockid]);
2204
2205 #ifdef USE_SELECT
2206                 LOCK(&sock->manager->lock);
2207                 if (sock->manager->maxfd < sock->fd)
2208                         sock->manager->maxfd = sock->fd;
2209                 UNLOCK(&sock->manager->lock);
2210 #endif
2211         }
2212
2213         return (result);
2214 }
2215
2216 /*
2217  * Attach to a socket.  Caller must explicitly detach when it is done.
2218  */
2219 void
2220 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2221         REQUIRE(VALID_SOCKET(sock));
2222         REQUIRE(socketp != NULL && *socketp == NULL);
2223
2224         LOCK(&sock->lock);
2225         sock->references++;
2226         UNLOCK(&sock->lock);
2227
2228         *socketp = sock;
2229 }
2230
2231 /*
2232  * Dereference a socket.  If this is the last reference to it, clean things
2233  * up by destroying the socket.
2234  */
2235 void
2236 isc_socket_detach(isc_socket_t **socketp) {
2237         isc_socket_t *sock;
2238         isc_boolean_t kill_socket = ISC_FALSE;
2239
2240         REQUIRE(socketp != NULL);
2241         sock = *socketp;
2242         REQUIRE(VALID_SOCKET(sock));
2243
2244         LOCK(&sock->lock);
2245         REQUIRE(sock->references > 0);
2246         sock->references--;
2247         if (sock->references == 0)
2248                 kill_socket = ISC_TRUE;
2249         UNLOCK(&sock->lock);
2250
2251         if (kill_socket)
2252                 destroy(&sock);
2253
2254         *socketp = NULL;
2255 }
2256
2257 isc_result_t
2258 isc_socket_close(isc_socket_t *sock) {
2259         int fd;
2260
2261         REQUIRE(VALID_SOCKET(sock));
2262
2263         LOCK(&sock->lock);
2264         REQUIRE(sock->references == 1);
2265         UNLOCK(&sock->lock);
2266         /*
2267          * We don't need to retain the lock hereafter, since no one else has
2268          * this socket.
2269          */
2270
2271         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2272
2273         INSIST(!sock->connecting);
2274         INSIST(!sock->pending_recv);
2275         INSIST(!sock->pending_send);
2276         INSIST(!sock->pending_accept);
2277         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2278         INSIST(ISC_LIST_EMPTY(sock->send_list));
2279         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2280         INSIST(sock->connect_ev == NULL);
2281
2282         fd = sock->fd;
2283         sock->fd = -1;
2284         sock->listener = 0;
2285         sock->connected = 0;
2286         sock->connecting = 0;
2287         sock->bound = 0;
2288         isc_sockaddr_any(&sock->address);
2289
2290         closesocket(sock->manager, sock->type, fd);
2291
2292         return (ISC_R_SUCCESS);
2293 }
2294
2295 /*
2296  * I/O is possible on a given socket.  Schedule an event to this task that
2297  * will call an internal function to do the I/O.  This will charge the
2298  * task with the I/O operation and let our select loop handler get back
2299  * to doing something real as fast as possible.
2300  *
2301  * The socket and manager must be locked before calling this function.
2302  */
2303 static void
2304 dispatch_recv(isc_socket_t *sock) {
2305         intev_t *iev;
2306         isc_socketevent_t *ev;
2307
2308         INSIST(!sock->pending_recv);
2309
2310         ev = ISC_LIST_HEAD(sock->recv_list);
2311         if (ev == NULL)
2312                 return;
2313
2314         sock->pending_recv = 1;
2315         iev = &sock->readable_ev;
2316
2317         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2318                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
2319
2320         sock->references++;
2321         iev->ev_sender = sock;
2322         iev->ev_action = internal_recv;
2323         iev->ev_arg = sock;
2324
2325         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2326 }
2327
2328 static void
2329 dispatch_send(isc_socket_t *sock) {
2330         intev_t *iev;
2331         isc_socketevent_t *ev;
2332
2333         INSIST(!sock->pending_send);
2334
2335         ev = ISC_LIST_HEAD(sock->send_list);
2336         if (ev == NULL)
2337                 return;
2338
2339         sock->pending_send = 1;
2340         iev = &sock->writable_ev;
2341
2342         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2343                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
2344
2345         sock->references++;
2346         iev->ev_sender = sock;
2347         iev->ev_action = internal_send;
2348         iev->ev_arg = sock;
2349
2350         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2351 }
2352
2353 /*
2354  * Dispatch an internal accept event.
2355  */
2356 static void
2357 dispatch_accept(isc_socket_t *sock) {
2358         intev_t *iev;
2359         isc_socket_newconnev_t *ev;
2360
2361         INSIST(!sock->pending_accept);
2362
2363         /*
2364          * Are there any done events left, or were they all canceled
2365          * before the manager got the socket lock?
2366          */
2367         ev = ISC_LIST_HEAD(sock->accept_list);
2368         if (ev == NULL)
2369                 return;
2370
2371         sock->pending_accept = 1;
2372         iev = &sock->readable_ev;
2373
2374         sock->references++;  /* keep socket around for this internal event */
2375         iev->ev_sender = sock;
2376         iev->ev_action = internal_accept;
2377         iev->ev_arg = sock;
2378
2379         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2380 }
2381
2382 static void
2383 dispatch_connect(isc_socket_t *sock) {
2384         intev_t *iev;
2385         isc_socket_connev_t *ev;
2386
2387         iev = &sock->writable_ev;
2388
2389         ev = sock->connect_ev;
2390         INSIST(ev != NULL); /* XXX */
2391
2392         INSIST(sock->connecting);
2393
2394         sock->references++;  /* keep socket around for this internal event */
2395         iev->ev_sender = sock;
2396         iev->ev_action = internal_connect;
2397         iev->ev_arg = sock;
2398
2399         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2400 }
2401
2402 /*
2403  * Dequeue an item off the given socket's read queue, set the result code
2404  * in the done event to the one provided, and send it to the task it was
2405  * destined for.
2406  *
2407  * If the event to be sent is on a list, remove it before sending.  If
2408  * asked to, send and detach from the socket as well.
2409  *
2410  * Caller must have the socket locked if the event is attached to the socket.
2411  */
2412 static void
2413 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2414         isc_task_t *task;
2415
2416         task = (*dev)->ev_sender;
2417
2418         (*dev)->ev_sender = sock;
2419
2420         if (ISC_LINK_LINKED(*dev, ev_link))
2421                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2422
2423         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2424             == ISC_SOCKEVENTATTR_ATTACHED)
2425                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2426         else
2427                 isc_task_send(task, (isc_event_t **)dev);
2428 }
2429
2430 /*
2431  * See comments for send_recvdone_event() above.
2432  *
2433  * Caller must have the socket locked if the event is attached to the socket.
2434  */
2435 static void
2436 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2437         isc_task_t *task;
2438
2439         INSIST(dev != NULL && *dev != NULL);
2440
2441         task = (*dev)->ev_sender;
2442         (*dev)->ev_sender = sock;
2443
2444         if (ISC_LINK_LINKED(*dev, ev_link))
2445                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2446
2447         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2448             == ISC_SOCKEVENTATTR_ATTACHED)
2449                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2450         else
2451                 isc_task_send(task, (isc_event_t **)dev);
2452 }
2453
2454 /*
2455  * Call accept() on a socket, to get the new file descriptor.  The listen
2456  * socket is used as a prototype to create a new isc_socket_t.  The new
2457  * socket has one outstanding reference.  The task receiving the event
2458  * will be detached from just after the event is delivered.
2459  *
2460  * On entry to this function, the event delivered is the internal
2461  * readable event, and the first item on the accept_list should be
2462  * the done event we want to send.  If the list is empty, this is a no-op,
2463  * so just unlock and return.
2464  */
2465 static void
2466 internal_accept(isc_task_t *me, isc_event_t *ev) {
2467         isc_socket_t *sock;
2468         isc_socketmgr_t *manager;
2469         isc_socket_newconnev_t *dev;
2470         isc_task_t *task;
2471         ISC_SOCKADDR_LEN_T addrlen;
2472         int fd;
2473         isc_result_t result = ISC_R_SUCCESS;
2474         char strbuf[ISC_STRERRORSIZE];
2475         const char *err = "accept";
2476
2477         UNUSED(me);
2478
2479         sock = ev->ev_sender;
2480         INSIST(VALID_SOCKET(sock));
2481
2482         LOCK(&sock->lock);
2483         socket_log(sock, NULL, TRACE,
2484                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2485                    "internal_accept called, locked socket");
2486
2487         manager = sock->manager;
2488         INSIST(VALID_MANAGER(manager));
2489
2490         INSIST(sock->listener);
2491         INSIST(sock->pending_accept == 1);
2492         sock->pending_accept = 0;
2493
2494         INSIST(sock->references > 0);
2495         sock->references--;  /* the internal event is done with this socket */
2496         if (sock->references == 0) {
2497                 UNLOCK(&sock->lock);
2498                 destroy(&sock);
2499                 return;
2500         }
2501
2502         /*
2503          * Get the first item off the accept list.
2504          * If it is empty, unlock the socket and return.
2505          */
2506         dev = ISC_LIST_HEAD(sock->accept_list);
2507         if (dev == NULL) {
2508                 UNLOCK(&sock->lock);
2509                 return;
2510         }
2511
2512         /*
2513          * Try to accept the new connection.  If the accept fails with
2514          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2515          * again.  Also ignore ECONNRESET, which has been reported to
2516          * be spuriously returned on Linux 2.2.19 although it is not
2517          * a documented error for accept().  ECONNABORTED has been
2518          * reported for Solaris 8.  The rest are thrown in not because
2519          * we have seen them but because they are ignored by other
2520          * deamons such as BIND 8 and Apache.
2521          */
2522
2523         addrlen = sizeof(dev->newsocket->address.type);
2524         memset(&dev->newsocket->address.type, 0, addrlen);
2525         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
2526                     (void *)&addrlen);
2527
2528 #ifdef F_DUPFD
2529         /*
2530          * Leave a space for stdio to work in.
2531          */
2532         if (fd >= 0 && fd < 20) {
2533                 int new, tmp;
2534                 new = fcntl(fd, F_DUPFD, 20);
2535                 tmp = errno;
2536                 (void)close(fd);
2537                 errno = tmp;
2538                 fd = new;
2539                 err = "accept/fcntl";
2540         }
2541 #endif
2542
2543         if (fd < 0) {
2544                 if (SOFT_ERROR(errno))
2545                         goto soft_error;
2546                 switch (errno) {
2547                 case ENFILE:
2548                 case EMFILE:
2549                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2550                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2551                                        isc_msgcat, ISC_MSGSET_SOCKET,
2552                                        ISC_MSG_TOOMANYFDS,
2553                                        "%s: too many open file descriptors",
2554                                        err);
2555                         goto soft_error;
2556
2557                 case ENOBUFS:
2558                 case ENOMEM:
2559                 case ECONNRESET:
2560                 case ECONNABORTED:
2561                 case EHOSTUNREACH:
2562                 case EHOSTDOWN:
2563                 case ENETUNREACH:
2564                 case ENETDOWN:
2565                 case ECONNREFUSED:
2566 #ifdef EPROTO
2567                 case EPROTO:
2568 #endif
2569 #ifdef ENONET
2570                 case ENONET:
2571 #endif
2572                         goto soft_error;
2573                 default:
2574                         break;
2575                 }
2576                 isc__strerror(errno, strbuf, sizeof(strbuf));
2577                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2578                                  "internal_accept: %s() %s: %s", err,
2579                                  isc_msgcat_get(isc_msgcat,
2580                                                 ISC_MSGSET_GENERAL,
2581                                                 ISC_MSG_FAILED,
2582                                                 "failed"),
2583                                  strbuf);
2584                 fd = -1;
2585                 result = ISC_R_UNEXPECTED;
2586         } else {
2587                 if (addrlen == 0U) {
2588                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2589                                          "internal_accept(): "
2590                                          "accept() failed to return "
2591                                          "remote address");
2592
2593                         (void)close(fd);
2594                         goto soft_error;
2595                 } else if (dev->newsocket->address.type.sa.sa_family !=
2596                            sock->pf)
2597                 {
2598                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2599                                          "internal_accept(): "
2600                                          "accept() returned peer address "
2601                                          "family %u (expected %u)",
2602                                          dev->newsocket->address.
2603                                          type.sa.sa_family,
2604                                          sock->pf);
2605                         (void)close(fd);
2606                         goto soft_error;
2607                 } else if (fd >= (int)manager->maxsocks) {
2608                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2609                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2610                                        isc_msgcat, ISC_MSGSET_SOCKET,
2611                                        ISC_MSG_TOOMANYFDS,
2612                                        "accept: "
2613                                        "file descriptor exceeds limit (%d/%u)",
2614                                        fd, manager->maxsocks);
2615                         (void)close(fd);
2616                         goto soft_error;
2617                 }
2618         }
2619
2620         if (fd != -1) {
2621                 dev->newsocket->address.length = addrlen;
2622                 dev->newsocket->pf = sock->pf;
2623         }
2624
2625         /*
2626          * Pull off the done event.
2627          */
2628         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2629
2630         /*
2631          * Poke watcher if there are more pending accepts.
2632          */
2633         if (!ISC_LIST_EMPTY(sock->accept_list))
2634                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2635
2636         UNLOCK(&sock->lock);
2637
2638         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2639                 (void)close(fd);
2640                 fd = -1;
2641                 result = ISC_R_UNEXPECTED;
2642         }
2643
2644         /*
2645          * -1 means the new socket didn't happen.
2646          */
2647         if (fd != -1) {
2648                 int lockid = FDLOCK_ID(fd);
2649
2650                 LOCK(&manager->fdlock[lockid]);
2651                 manager->fds[fd] = dev->newsocket;
2652                 manager->fdstate[fd] = MANAGED;
2653                 UNLOCK(&manager->fdlock[lockid]);
2654
2655                 LOCK(&manager->lock);
2656                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2657
2658                 dev->newsocket->fd = fd;
2659                 dev->newsocket->bound = 1;
2660                 dev->newsocket->connected = 1;
2661
2662                 /*
2663                  * Save away the remote address
2664                  */
2665                 dev->address = dev->newsocket->address;
2666
2667 #ifdef USE_SELECT
2668                 if (manager->maxfd < fd)
2669                         manager->maxfd = fd;
2670 #endif
2671
2672                 socket_log(sock, &dev->newsocket->address, CREATION,
2673                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2674                            "accepted connection, new socket %p",
2675                            dev->newsocket);
2676
2677                 UNLOCK(&manager->lock);
2678         } else {
2679                 dev->newsocket->references--;
2680                 free_socket(&dev->newsocket);
2681         }
2682
2683         /*
2684          * Fill in the done event details and send it off.
2685          */
2686         dev->result = result;
2687         task = dev->ev_sender;
2688         dev->ev_sender = sock;
2689
2690         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2691         return;
2692
2693  soft_error:
2694         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2695         UNLOCK(&sock->lock);
2696         return;
2697 }
2698
2699 static void
2700 internal_recv(isc_task_t *me, isc_event_t *ev) {
2701         isc_socketevent_t *dev;
2702         isc_socket_t *sock;
2703
2704         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2705
2706         sock = ev->ev_sender;
2707         INSIST(VALID_SOCKET(sock));
2708
2709         LOCK(&sock->lock);
2710         socket_log(sock, NULL, IOEVENT,
2711                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2712                    "internal_recv: task %p got event %p", me, ev);
2713
2714         INSIST(sock->pending_recv == 1);
2715         sock->pending_recv = 0;
2716
2717         INSIST(sock->references > 0);
2718         sock->references--;  /* the internal event is done with this socket */
2719         if (sock->references == 0) {
2720                 UNLOCK(&sock->lock);
2721                 destroy(&sock);
2722                 return;
2723         }
2724
2725         /*
2726          * Try to do as much I/O as possible on this socket.  There are no
2727          * limits here, currently.
2728          */
2729         dev = ISC_LIST_HEAD(sock->recv_list);
2730         while (dev != NULL) {
2731                 switch (doio_recv(sock, dev)) {
2732                 case DOIO_SOFT:
2733                         goto poke;
2734
2735                 case DOIO_EOF:
2736                         /*
2737                          * read of 0 means the remote end was closed.
2738                          * Run through the event queue and dispatch all
2739                          * the events with an EOF result code.
2740                          */
2741                         do {
2742                                 dev->result = ISC_R_EOF;
2743                                 send_recvdone_event(sock, &dev);
2744                                 dev = ISC_LIST_HEAD(sock->recv_list);
2745                         } while (dev != NULL);
2746                         goto poke;
2747
2748                 case DOIO_SUCCESS:
2749                 case DOIO_HARD:
2750                         send_recvdone_event(sock, &dev);
2751                         break;
2752                 }
2753
2754                 dev = ISC_LIST_HEAD(sock->recv_list);
2755         }
2756
2757  poke:
2758         if (!ISC_LIST_EMPTY(sock->recv_list))
2759                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2760
2761         UNLOCK(&sock->lock);
2762 }
2763
2764 static void
2765 internal_send(isc_task_t *me, isc_event_t *ev) {
2766         isc_socketevent_t *dev;
2767         isc_socket_t *sock;
2768
2769         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2770
2771         /*
2772          * Find out what socket this is and lock it.
2773          */
2774         sock = (isc_socket_t *)ev->ev_sender;
2775         INSIST(VALID_SOCKET(sock));
2776
2777         LOCK(&sock->lock);
2778         socket_log(sock, NULL, IOEVENT,
2779                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2780                    "internal_send: task %p got event %p", me, ev);
2781
2782         INSIST(sock->pending_send == 1);
2783         sock->pending_send = 0;
2784
2785         INSIST(sock->references > 0);
2786         sock->references--;  /* the internal event is done with this socket */
2787         if (sock->references == 0) {
2788                 UNLOCK(&sock->lock);
2789                 destroy(&sock);
2790                 return;
2791         }
2792
2793         /*
2794          * Try to do as much I/O as possible on this socket.  There are no
2795          * limits here, currently.
2796          */
2797         dev = ISC_LIST_HEAD(sock->send_list);
2798         while (dev != NULL) {
2799                 switch (doio_send(sock, dev)) {
2800                 case DOIO_SOFT:
2801                         goto poke;
2802
2803                 case DOIO_HARD:
2804                 case DOIO_SUCCESS:
2805                         send_senddone_event(sock, &dev);
2806                         break;
2807                 }
2808
2809                 dev = ISC_LIST_HEAD(sock->send_list);
2810         }
2811
2812  poke:
2813         if (!ISC_LIST_EMPTY(sock->send_list))
2814                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2815
2816         UNLOCK(&sock->lock);
2817 }
2818
2819 /*
2820  * Process read/writes on each fd here.  Avoid locking
2821  * and unlocking twice if both reads and writes are possible.
2822  */
2823 static void
2824 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
2825            isc_boolean_t writeable)
2826 {
2827         isc_socket_t *sock;
2828         isc_boolean_t unlock_sock;
2829         int lockid = FDLOCK_ID(fd);
2830
2831         /*
2832          * If the socket is going to be closed, don't do more I/O.
2833          */
2834         LOCK(&manager->fdlock[lockid]);
2835         if (manager->fdstate[fd] == CLOSE_PENDING) {
2836                 UNLOCK(&manager->fdlock[lockid]);
2837
2838                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
2839                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
2840                 return;
2841         }
2842
2843         sock = manager->fds[fd];
2844         UNLOCK(&manager->fdlock[lockid]);
2845         unlock_sock = ISC_FALSE;
2846         if (readable) {
2847                 if (sock == NULL) {
2848                         (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
2849                         goto check_write;
2850                 }
2851                 unlock_sock = ISC_TRUE;
2852                 LOCK(&sock->lock);
2853                 if (!SOCK_DEAD(sock)) {
2854                         if (sock->listener)
2855                                 dispatch_accept(sock);
2856                         else
2857                                 dispatch_recv(sock);
2858                 }
2859                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
2860         }
2861 check_write:
2862         if (writeable) {
2863                 if (sock == NULL) {
2864                         (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
2865                         return;
2866                 }
2867                 if (!unlock_sock) {
2868                         unlock_sock = ISC_TRUE;
2869                         LOCK(&sock->lock);
2870                 }
2871                 if (!SOCK_DEAD(sock)) {
2872                         if (sock->connecting)
2873                                 dispatch_connect(sock);
2874                         else
2875                                 dispatch_send(sock);
2876                 }
2877                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
2878         }
2879         if (unlock_sock)
2880                 UNLOCK(&sock->lock);
2881 }
2882
2883 #ifdef USE_KQUEUE
2884 static isc_boolean_t
2885 process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
2886         int i;
2887         isc_boolean_t readable, writable;
2888         isc_boolean_t done = ISC_FALSE;
2889 #ifdef ISC_PLATFORM_USETHREADS
2890         isc_boolean_t have_ctlevent = ISC_FALSE;
2891 #endif
2892
2893         if (nevents == manager->nevents) {
2894                 /*
2895                  * This is not an error, but something unexpected.  If this
2896                  * happens, it may indicate the need for increasing
2897                  * ISC_SOCKET_MAXEVENTS.
2898                  */
2899                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
2900                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
2901                             "maximum number of FD events (%d) received",
2902                             nevents);
2903         }
2904
2905         for (i = 0; i < nevents; i++) {
2906                 REQUIRE(events[i].ident < manager->maxsocks);
2907 #ifdef ISC_PLATFORM_USETHREADS
2908                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
2909                         have_ctlevent = ISC_TRUE;
2910                         continue;
2911                 }
2912 #endif
2913                 readable = ISC_TF(events[i].filter == EVFILT_READ);
2914                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
2915                 process_fd(manager, events[i].ident, readable, writable);
2916         }
2917
2918 #ifdef ISC_PLATFORM_USETHREADS
2919         if (have_ctlevent)
2920                 done = process_ctlfd(manager);
2921 #endif
2922
2923         return (done);
2924 }
2925 #elif defined(USE_EPOLL)
2926 static isc_boolean_t
2927 process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
2928         int i;
2929         isc_boolean_t done = ISC_FALSE;
2930 #ifdef ISC_PLATFORM_USETHREADS
2931         isc_boolean_t have_ctlevent = ISC_FALSE;
2932 #endif
2933
2934         if (nevents == manager->nevents) {
2935                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
2936                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
2937                             "maximum number of FD events (%d) received",
2938                             nevents);
2939         }
2940
2941         for (i = 0; i < nevents; i++) {
2942                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
2943 #ifdef ISC_PLATFORM_USETHREADS
2944                 if (events[i].data.fd == manager->pipe_fds[0]) {
2945                         have_ctlevent = ISC_TRUE;
2946                         continue;
2947                 }
2948 #endif
2949                 if ((events[i].events & EPOLLERR) != 0 ||
2950                     (events[i].events & EPOLLHUP) != 0) {
2951                         /*
2952                          * epoll does not set IN/OUT bits on an erroneous
2953                          * condition, so we need to try both anyway.  This is a
2954                          * bit inefficient, but should be okay for such rare
2955                          * events.  Note also that the read or write attempt
2956                          * won't block because we use non-blocking sockets.
2957                          */
2958                         events[i].events |= (EPOLLIN | EPOLLOUT);
2959                 }
2960                 process_fd(manager, events[i].data.fd,
2961                            (events[i].events & EPOLLIN) != 0,
2962                            (events[i].events & EPOLLOUT) != 0);
2963         }
2964
2965 #ifdef ISC_PLATFORM_USETHREADS
2966         if (have_ctlevent)
2967                 done = process_ctlfd(manager);
2968 #endif
2969
2970         return (done);
2971 }
2972 #elif defined(USE_DEVPOLL)
2973 static isc_boolean_t
2974 process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
2975         int i;
2976         isc_boolean_t done = ISC_FALSE;
2977 #ifdef ISC_PLATFORM_USETHREADS
2978         isc_boolean_t have_ctlevent = ISC_FALSE;
2979 #endif
2980
2981         if (nevents == manager->nevents) {
2982                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
2983                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
2984                             "maximum number of FD events (%d) received",
2985                             nevents);
2986         }
2987
2988         for (i = 0; i < nevents; i++) {
2989                 REQUIRE(events[i].fd < (int)manager->maxsocks);
2990 #ifdef ISC_PLATFORM_USETHREADS
2991                 if (events[i].fd == manager->pipe_fds[0]) {
2992                         have_ctlevent = ISC_TRUE;
2993                         continue;
2994                 }
2995 #endif
2996                 process_fd(manager, events[i].fd,
2997                            (events[i].events & POLLIN) != 0,
2998                            (events[i].events & POLLOUT) != 0);
2999         }
3000
3001 #ifdef ISC_PLATFORM_USETHREADS
3002         if (have_ctlevent)
3003                 done = process_ctlfd(manager);
3004 #endif
3005
3006         return (done);
3007 }
3008 #elif defined(USE_SELECT)
3009 static void
3010 process_fds(isc_socketmgr_t *manager, int maxfd,
3011             fd_set *readfds, fd_set *writefds)
3012 {
3013         int i;
3014
3015         REQUIRE(maxfd <= (int)manager->maxsocks);
3016
3017         for (i = 0; i < maxfd; i++) {
3018 #ifdef ISC_PLATFORM_USETHREADS
3019                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3020                         continue;
3021 #endif /* ISC_PLATFORM_USETHREADS */
3022                 process_fd(manager, i, FD_ISSET(i, readfds),
3023                            FD_ISSET(i, writefds));
3024         }
3025 }
3026 #endif
3027
3028 #ifdef ISC_PLATFORM_USETHREADS
3029 static isc_boolean_t
3030 process_ctlfd(isc_socketmgr_t *manager) {
3031         int msg, fd;
3032
3033         for (;;) {
3034                 select_readmsg(manager, &fd, &msg);
3035
3036                 manager_log(manager, IOEVENT,
3037                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3038                                            ISC_MSG_WATCHERMSG,
3039                                            "watcher got message %d "
3040                                            "for socket %d"), msg, fd);
3041
3042                 /*
3043                  * Nothing to read?
3044                  */
3045                 if (msg == SELECT_POKE_NOTHING)
3046                         break;
3047
3048                 /*
3049                  * Handle shutdown message.  We really should
3050                  * jump out of this loop right away, but
3051                  * it doesn't matter if we have to do a little
3052                  * more work first.
3053                  */
3054                 if (msg == SELECT_POKE_SHUTDOWN)
3055                         return (ISC_TRUE);
3056
3057                 /*
3058                  * This is a wakeup on a socket.  Look
3059                  * at the event queue for both read and write,
3060                  * and decide if we need to watch on it now
3061                  * or not.
3062                  */
3063                 wakeup_socket(manager, fd, msg);
3064         }
3065
3066         return (ISC_FALSE);
3067 }
3068
3069 /*
3070  * This is the thread that will loop forever, always in a select or poll
3071  * call.
3072  *
3073  * When select returns something to do, track down what thread gets to do
3074  * this I/O and post the event to it.
3075  */
3076 static isc_threadresult_t
3077 watcher(void *uap) {
3078         isc_socketmgr_t *manager = uap;
3079         isc_boolean_t done;
3080         int ctlfd;
3081         int cc;
3082 #ifdef USE_KQUEUE
3083         const char *fnname = "kevent()";
3084 #elif defined (USE_EPOLL)
3085         const char *fnname = "epoll_wait()";
3086 #elif defined(USE_DEVPOLL)
3087         const char *fnname = "ioctl(DP_POLL)";
3088         struct dvpoll dvp;
3089 #elif defined (USE_SELECT)
3090         const char *fnname = "select()";
3091         int maxfd;
3092 #endif
3093         char strbuf[ISC_STRERRORSIZE];
3094 #ifdef ISC_SOCKET_USE_POLLWATCH
3095         pollstate_t pollstate = poll_idle;
3096 #endif
3097
3098         /*
3099          * Get the control fd here.  This will never change.
3100          */
3101         ctlfd = manager->pipe_fds[0];
3102         done = ISC_FALSE;
3103         while (!done) {
3104                 do {
3105 #ifdef USE_KQUEUE
3106                         cc = kevent(manager->kqueue_fd, NULL, 0,
3107                                     manager->events, manager->nevents, NULL);
3108 #elif defined(USE_EPOLL)
3109                         cc = epoll_wait(manager->epoll_fd, manager->events,
3110                                         manager->nevents, -1);
3111 #elif defined(USE_DEVPOLL)
3112                         dvp.dp_fds = manager->events;
3113                         dvp.dp_nfds = manager->nevents;
3114 #ifndef ISC_SOCKET_USE_POLLWATCH
3115                         dvp.dp_timeout = -1;
3116 #else
3117                         if (pollstate == poll_idle)
3118                                 dvp.dp_timeout = -1;
3119                         else
3120                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3121 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3122                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3123 #elif defined(USE_SELECT)
3124                         LOCK(&manager->lock);
3125                         memcpy(manager->read_fds_copy, manager->read_fds,
3126                                manager->fd_bufsize);
3127                         memcpy(manager->write_fds_copy, manager->write_fds,
3128                                manager->fd_bufsize);
3129                         maxfd = manager->maxfd + 1;
3130                         UNLOCK(&manager->lock);
3131
3132                         cc = select(maxfd, manager->read_fds_copy,
3133                                     manager->write_fds_copy, NULL, NULL);
3134 #endif  /* USE_KQUEUE */
3135
3136                         if (cc < 0 && !SOFT_ERROR(errno)) {
3137                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3138                                 FATAL_ERROR(__FILE__, __LINE__,
3139                                             "%s %s: %s", fnname,
3140                                             isc_msgcat_get(isc_msgcat,
3141                                                            ISC_MSGSET_GENERAL,
3142                                                            ISC_MSG_FAILED,
3143                                                            "failed"), strbuf);
3144                         }
3145
3146 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3147                         if (cc == 0) {
3148                                 if (pollstate == poll_active)
3149                                         pollstate = poll_checking;
3150                                 else if (pollstate == poll_checking)
3151                                         pollstate = poll_idle;
3152                         } else if (cc > 0) {
3153                                 if (pollstate == poll_checking) {
3154                                         /*
3155                                          * XXX: We'd like to use a more
3156                                          * verbose log level as it's actually an
3157                                          * unexpected event, but the kernel bug
3158                                          * reportedly happens pretty frequently
3159                                          * (and it can also be a false positive)
3160                                          * so it would be just too noisy.
3161                                          */
3162                                         manager_log(manager,
3163                                                     ISC_LOGCATEGORY_GENERAL,
3164                                                     ISC_LOGMODULE_SOCKET,
3165                                                     ISC_LOG_DEBUG(1),
3166                                                     ISC_LOG_INFO,
3167                                                     "unexpected POLL timeout");
3168                                 }
3169                                 pollstate = poll_active;
3170                         }
3171 #endif
3172                 } while (cc < 0);
3173
3174 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3175                 done = process_fds(manager, manager->events, cc);
3176 #elif defined(USE_SELECT)
3177                 process_fds(manager, maxfd, manager->read_fds_copy,
3178                             manager->write_fds_copy);
3179
3180                 /*
3181                  * Process reads on internal, control fd.
3182                  */
3183                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3184                         done = process_ctlfd(manager);
3185 #endif
3186         }
3187
3188         manager_log(manager, TRACE,
3189                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3190                                    ISC_MSG_EXITING, "watcher exiting"));
3191
3192         return ((isc_threadresult_t)0);
3193 }
3194 #endif /* ISC_PLATFORM_USETHREADS */
3195
3196 void
3197 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3198
3199         REQUIRE(VALID_MANAGER(manager));
3200
3201         manager->reserved = reserved;
3202 }
3203
3204 /*
3205  * Create a new socket manager.
3206  */
3207
3208 static isc_result_t
3209 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3210         isc_result_t result;
3211
3212 #ifdef USE_KQUEUE
3213         manager->nevents = ISC_SOCKET_MAXEVENTS;
3214         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3215                                       manager->nevents);
3216         if (manager->events == NULL)
3217                 return (ISC_R_NOMEMORY);
3218         manager->kqueue_fd = kqueue();
3219         if (manager->kqueue_fd == -1) {
3220                 result = isc__errno2result(errno);
3221                 isc_mem_put(mctx, manager->events,
3222                             sizeof(struct kevent) * manager->nevents);
3223                 return (result);
3224         }
3225
3226 #ifdef ISC_PLATFORM_USETHREADS
3227         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3228         if (result != ISC_R_SUCCESS) {
3229                 close(manager->kqueue_fd);
3230                 isc_mem_put(mctx, manager->events,
3231                             sizeof(struct kevent) * manager->nevents);
3232                 return (result);
3233         }
3234 #endif  /* ISC_PLATFORM_USETHREADS */
3235 #elif defined(USE_EPOLL)
3236         manager->nevents = ISC_SOCKET_MAXEVENTS;
3237         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3238                                       manager->nevents);
3239         if (manager->events == NULL)
3240                 return (ISC_R_NOMEMORY);
3241         manager->epoll_fd = epoll_create(manager->nevents);
3242         if (manager->epoll_fd == -1) {
3243                 result = isc__errno2result(errno);
3244                 isc_mem_put(mctx, manager->events,
3245                             sizeof(struct epoll_event) * manager->nevents);
3246                 return (result);
3247         }
3248 #ifdef ISC_PLATFORM_USETHREADS
3249         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3250         if (result != ISC_R_SUCCESS) {
3251                 close(manager->epoll_fd);
3252                 isc_mem_put(mctx, manager->events,
3253                             sizeof(struct epoll_event) * manager->nevents);
3254                 return (result);
3255         }
3256 #endif  /* ISC_PLATFORM_USETHREADS */
3257 #elif defined(USE_DEVPOLL)
3258         /*
3259          * XXXJT: /dev/poll seems to reject large numbers of events,
3260          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3261          */
3262         manager->nevents = ISC_SOCKET_MAXEVENTS;
3263         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3264                                       manager->nevents);
3265         if (manager->events == NULL)
3266                 return (ISC_R_NOMEMORY);
3267         /*
3268          * Note: fdpollinfo should be able to support all possible FDs, so
3269          * it must have maxsocks entries (not nevents).
3270          */
3271         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3272                                           manager->maxsocks);
3273         if (manager->fdpollinfo == NULL) {
3274                 isc_mem_put(mctx, manager->events,
3275                             sizeof(pollinfo_t) * manager->maxsocks);
3276                 return (ISC_R_NOMEMORY);
3277         }
3278         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3279         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3280         if (manager->devpoll_fd == -1) {
3281                 result = isc__errno2result(errno);
3282                 isc_mem_put(mctx, manager->events,
3283                             sizeof(struct pollfd) * manager->nevents);
3284                 isc_mem_put(mctx, manager->fdpollinfo,
3285                             sizeof(pollinfo_t) * manager->maxsocks);
3286                 return (result);
3287         }
3288 #ifdef ISC_PLATFORM_USETHREADS
3289         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3290         if (result != ISC_R_SUCCESS) {
3291                 close(manager->devpoll_fd);
3292                 isc_mem_put(mctx, manager->events,
3293                             sizeof(struct pollfd) * manager->nevents);
3294                 isc_mem_put(mctx, manager->fdpollinfo,
3295                             sizeof(pollinfo_t) * manager->maxsocks);
3296                 return (result);
3297         }
3298 #endif  /* ISC_PLATFORM_USETHREADS */
3299 #elif defined(USE_SELECT)
3300         UNUSED(result);
3301
3302 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3303         /*
3304          * Note: this code should also cover the case of MAXSOCKETS <=
3305          * FD_SETSIZE, but we separate the cases to avoid possible portability
3306          * issues regarding howmany() and the actual representation of fd_set.
3307          */
3308         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3309                 sizeof(fd_mask);
3310 #else
3311         manager->fd_bufsize = sizeof(fd_set);
3312 #endif
3313
3314         manager->read_fds = NULL;
3315         manager->read_fds_copy = NULL;
3316         manager->write_fds = NULL;
3317         manager->write_fds_copy = NULL;
3318
3319         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3320         if (manager->read_fds != NULL)
3321                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3322         if (manager->read_fds_copy != NULL)
3323                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3324         if (manager->write_fds != NULL) {
3325                 manager->write_fds_copy = isc_mem_get(mctx,
3326                                                       manager->fd_bufsize);
3327         }
3328         if (manager->write_fds_copy == NULL) {
3329                 if (manager->write_fds != NULL) {
3330                         isc_mem_put(mctx, manager->write_fds,
3331                                     manager->fd_bufsize);
3332                 }
3333                 if (manager->read_fds_copy != NULL) {
3334                         isc_mem_put(mctx, manager->read_fds_copy,
3335                                     manager->fd_bufsize);
3336                 }
3337                 if (manager->read_fds != NULL) {
3338                         isc_mem_put(mctx, manager->read_fds,
3339                                     manager->fd_bufsize);
3340                 }
3341                 return (ISC_R_NOMEMORY);
3342         }
3343         memset(manager->read_fds, 0, manager->fd_bufsize);
3344         memset(manager->write_fds, 0, manager->fd_bufsize);
3345
3346 #ifdef ISC_PLATFORM_USETHREADS
3347         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3348         manager->maxfd = manager->pipe_fds[0];
3349 #else /* ISC_PLATFORM_USETHREADS */
3350         manager->maxfd = 0;
3351 #endif /* ISC_PLATFORM_USETHREADS */
3352 #endif  /* USE_KQUEUE */
3353
3354         return (ISC_R_SUCCESS);
3355 }
3356
3357 static void
3358 cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3359 #ifdef ISC_PLATFORM_USETHREADS
3360         isc_result_t result;
3361
3362         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3363         if (result != ISC_R_SUCCESS) {
3364                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3365                                  "epoll_ctl(DEL) %s",
3366                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3367                                                 ISC_MSG_FAILED, "failed"));
3368         }
3369 #endif  /* ISC_PLATFORM_USETHREADS */
3370
3371 #ifdef USE_KQUEUE
3372         close(manager->kqueue_fd);
3373         isc_mem_put(mctx, manager->events,
3374                     sizeof(struct kevent) * manager->nevents);
3375 #elif defined(USE_EPOLL)
3376         close(manager->epoll_fd);
3377         isc_mem_put(mctx, manager->events,
3378                     sizeof(struct epoll_event) * manager->nevents);
3379 #elif defined(USE_DEVPOLL)
3380         close(manager->devpoll_fd);
3381         isc_mem_put(mctx, manager->events,
3382                     sizeof(struct pollfd) * manager->nevents);
3383         isc_mem_put(mctx, manager->fdpollinfo,
3384                     sizeof(pollinfo_t) * manager->maxsocks);
3385 #elif defined(USE_SELECT)
3386         if (manager->read_fds != NULL)
3387                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3388         if (manager->read_fds_copy != NULL)
3389                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3390         if (manager->write_fds != NULL)
3391                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3392         if (manager->write_fds_copy != NULL)
3393                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3394 #endif  /* USE_KQUEUE */
3395 }
3396
3397 isc_result_t
3398 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3399         return (isc_socketmgr_create2(mctx, managerp, 0));
3400 }
3401
3402 isc_result_t
3403 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3404                       unsigned int maxsocks)
3405 {
3406         int i;
3407         isc_socketmgr_t *manager;
3408 #ifdef ISC_PLATFORM_USETHREADS
3409         char strbuf[ISC_STRERRORSIZE];
3410 #endif
3411         isc_result_t result;
3412
3413         REQUIRE(managerp != NULL && *managerp == NULL);
3414
3415 #ifndef ISC_PLATFORM_USETHREADS
3416         if (socketmgr != NULL) {
3417                 /* Don't allow maxsocks to be updated */
3418                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3419                         return (ISC_R_EXISTS);
3420
3421                 socketmgr->refs++;
3422                 *managerp = socketmgr;
3423                 return (ISC_R_SUCCESS);
3424         }
3425 #endif /* ISC_PLATFORM_USETHREADS */
3426
3427         if (maxsocks == 0)
3428                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3429
3430         manager = isc_mem_get(mctx, sizeof(*manager));
3431         if (manager == NULL)
3432                 return (ISC_R_NOMEMORY);
3433
3434         /* zero-clear so that necessary cleanup on failure will be easy */
3435         memset(manager, 0, sizeof(*manager));
3436         manager->maxsocks = maxsocks;
3437         manager->reserved = 0;
3438         manager->fds = isc_mem_get(mctx,
3439                                    manager->maxsocks * sizeof(isc_socket_t *));
3440         if (manager->fds == NULL) {
3441                 result = ISC_R_NOMEMORY;
3442                 goto free_manager;
3443         }
3444         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3445         if (manager->fds == NULL) {
3446                 result = ISC_R_NOMEMORY;
3447                 goto free_manager;
3448         }
3449
3450         manager->magic = SOCKET_MANAGER_MAGIC;
3451         manager->mctx = NULL;
3452         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3453         ISC_LIST_INIT(manager->socklist);
3454         result = isc_mutex_init(&manager->lock);
3455         if (result != ISC_R_SUCCESS)
3456                 goto free_manager;
3457         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3458         if (manager->fdlock == NULL) {
3459                 result = ISC_R_NOMEMORY;
3460                 goto cleanup_lock;
3461         }
3462         for (i = 0; i < FDLOCK_COUNT; i++) {
3463                 result = isc_mutex_init(&manager->fdlock[i]);
3464                 if (result != ISC_R_SUCCESS) {
3465                         while (--i >= 0)
3466                                 DESTROYLOCK(&manager->fdlock[i]);
3467                         isc_mem_put(mctx, manager->fdlock,
3468                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3469                         manager->fdlock = NULL;
3470                         goto cleanup_lock;
3471                 }
3472         }
3473
3474 #ifdef ISC_PLATFORM_USETHREADS
3475         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3476                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3477                                  "isc_condition_init() %s",
3478                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3479                                                 ISC_MSG_FAILED, "failed"));
3480                 result = ISC_R_UNEXPECTED;
3481                 goto cleanup_lock;
3482         }
3483
3484         /*
3485          * Create the special fds that will be used to wake up the
3486          * select/poll loop when something internal needs to be done.
3487          */
3488         if (pipe(manager->pipe_fds) != 0) {
3489                 isc__strerror(errno, strbuf, sizeof(strbuf));
3490                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3491                                  "pipe() %s: %s",
3492                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3493                                                 ISC_MSG_FAILED, "failed"),
3494                                  strbuf);
3495                 result = ISC_R_UNEXPECTED;
3496                 goto cleanup_condition;
3497         }
3498
3499         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3500 #if 0
3501         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3502 #endif
3503 #else /* ISC_PLATFORM_USETHREADS */
3504         manager->refs = 1;
3505 #endif /* ISC_PLATFORM_USETHREADS */
3506
3507         /*
3508          * Set up initial state for the select loop
3509          */
3510         result = setup_watcher(mctx, manager);
3511         if (result != ISC_R_SUCCESS)
3512                 goto cleanup;
3513         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3514 #ifdef ISC_PLATFORM_USETHREADS
3515         /*
3516          * Start up the select/poll thread.
3517          */
3518         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3519             ISC_R_SUCCESS) {
3520                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3521                                  "isc_thread_create() %s",
3522                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3523                                                 ISC_MSG_FAILED, "failed"));
3524                 cleanup_watcher(mctx, manager);
3525                 result = ISC_R_UNEXPECTED;
3526                 goto cleanup;
3527         }
3528 #endif /* ISC_PLATFORM_USETHREADS */
3529         isc_mem_attach(mctx, &manager->mctx);
3530
3531 #ifndef ISC_PLATFORM_USETHREADS
3532         socketmgr = manager;
3533 #endif /* ISC_PLATFORM_USETHREADS */
3534         *managerp = manager;
3535
3536         return (ISC_R_SUCCESS);
3537
3538 cleanup:
3539 #ifdef ISC_PLATFORM_USETHREADS
3540         (void)close(manager->pipe_fds[0]);
3541         (void)close(manager->pipe_fds[1]);
3542 #endif  /* ISC_PLATFORM_USETHREADS */
3543
3544 #ifdef ISC_PLATFORM_USETHREADS
3545 cleanup_condition:
3546         (void)isc_condition_destroy(&manager->shutdown_ok);
3547 #endif  /* ISC_PLATFORM_USETHREADS */
3548
3549
3550 cleanup_lock:
3551         if (manager->fdlock != NULL) {
3552                 for (i = 0; i < FDLOCK_COUNT; i++)
3553                         DESTROYLOCK(&manager->fdlock[i]);
3554         }
3555         DESTROYLOCK(&manager->lock);
3556
3557 free_manager:
3558         if (manager->fdlock != NULL) {
3559                 isc_mem_put(mctx, manager->fdlock,
3560                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3561         }
3562         if (manager->fdstate != NULL) {
3563                 isc_mem_put(mctx, manager->fdstate,
3564                             manager->maxsocks * sizeof(int));
3565         }
3566         if (manager->fds != NULL) {
3567                 isc_mem_put(mctx, manager->fds,
3568                             manager->maxsocks * sizeof(isc_socket_t *));
3569         }
3570         isc_mem_put(mctx, manager, sizeof(*manager));
3571
3572         return (result);
3573 }
3574
3575 isc_result_t
3576 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3577         REQUIRE(VALID_MANAGER(manager));
3578         REQUIRE(nsockp != NULL);
3579
3580         *nsockp = manager->maxsocks;
3581
3582         return (ISC_R_SUCCESS);
3583 }
3584
3585 void
3586 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
3587         isc_socketmgr_t *manager;
3588         int i;
3589         isc_mem_t *mctx;
3590
3591         /*
3592          * Destroy a socket manager.
3593          */
3594
3595         REQUIRE(managerp != NULL);
3596         manager = *managerp;
3597         REQUIRE(VALID_MANAGER(manager));
3598
3599 #ifndef ISC_PLATFORM_USETHREADS
3600         if (manager->refs > 1) {
3601                 manager->refs--;
3602                 *managerp = NULL;
3603                 return;
3604         }
3605 #endif /* ISC_PLATFORM_USETHREADS */
3606
3607         LOCK(&manager->lock);
3608
3609 #ifdef ISC_PLATFORM_USETHREADS
3610         /*
3611          * Wait for all sockets to be destroyed.
3612          */
3613         while (!ISC_LIST_EMPTY(manager->socklist)) {
3614                 manager_log(manager, CREATION,
3615                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3616                                            ISC_MSG_SOCKETSREMAIN,
3617                                            "sockets exist"));
3618                 WAIT(&manager->shutdown_ok, &manager->lock);
3619         }
3620 #else /* ISC_PLATFORM_USETHREADS */
3621         /*
3622          * Hope all sockets have been destroyed.
3623          */
3624         if (!ISC_LIST_EMPTY(manager->socklist)) {
3625                 manager_log(manager, CREATION,
3626                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3627                                            ISC_MSG_SOCKETSREMAIN,
3628                                            "sockets exist"));
3629                 INSIST(0);
3630         }
3631 #endif /* ISC_PLATFORM_USETHREADS */
3632
3633         UNLOCK(&manager->lock);
3634
3635         /*
3636          * Here, poke our select/poll thread.  Do this by closing the write
3637          * half of the pipe, which will send EOF to the read half.
3638          * This is currently a no-op in the non-threaded case.
3639          */
3640         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
3641
3642 #ifdef ISC_PLATFORM_USETHREADS
3643         /*
3644          * Wait for thread to exit.
3645          */
3646         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
3647                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3648                                  "isc_thread_join() %s",
3649                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3650                                                 ISC_MSG_FAILED, "failed"));
3651 #endif /* ISC_PLATFORM_USETHREADS */
3652
3653         /*
3654          * Clean up.
3655          */
3656         cleanup_watcher(manager->mctx, manager);
3657
3658 #ifdef ISC_PLATFORM_USETHREADS
3659         (void)close(manager->pipe_fds[0]);
3660         (void)close(manager->pipe_fds[1]);
3661         (void)isc_condition_destroy(&manager->shutdown_ok);
3662 #endif /* ISC_PLATFORM_USETHREADS */
3663
3664         for (i = 0; i < (int)manager->maxsocks; i++)
3665                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
3666                         (void)close(i);
3667
3668         isc_mem_put(manager->mctx, manager->fds,
3669                     manager->maxsocks * sizeof(isc_socket_t *));
3670         isc_mem_put(manager->mctx, manager->fdstate,
3671                     manager->maxsocks * sizeof(int));
3672
3673         if (manager->fdlock != NULL) {
3674                 for (i = 0; i < FDLOCK_COUNT; i++)
3675                         DESTROYLOCK(&manager->fdlock[i]);
3676                 isc_mem_put(manager->mctx, manager->fdlock,
3677                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3678         }
3679         DESTROYLOCK(&manager->lock);
3680         manager->magic = 0;
3681         mctx= manager->mctx;
3682         isc_mem_put(mctx, manager, sizeof(*manager));
3683
3684         isc_mem_detach(&mctx);
3685
3686         *managerp = NULL;
3687 }
3688
3689 static isc_result_t
3690 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
3691             unsigned int flags)
3692 {
3693         int io_state;
3694         isc_boolean_t have_lock = ISC_FALSE;
3695         isc_task_t *ntask = NULL;
3696         isc_result_t result = ISC_R_SUCCESS;
3697
3698         dev->ev_sender = task;
3699
3700         if (sock->type == isc_sockettype_udp) {
3701                 io_state = doio_recv(sock, dev);
3702         } else {
3703                 LOCK(&sock->lock);
3704                 have_lock = ISC_TRUE;
3705
3706                 if (ISC_LIST_EMPTY(sock->recv_list))
3707                         io_state = doio_recv(sock, dev);
3708                 else
3709                         io_state = DOIO_SOFT;
3710         }
3711
3712         switch (io_state) {
3713         case DOIO_SOFT:
3714                 /*
3715                  * We couldn't read all or part of the request right now, so
3716                  * queue it.
3717                  *
3718                  * Attach to socket and to task
3719                  */
3720                 isc_task_attach(task, &ntask);
3721                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
3722
3723                 if (!have_lock) {
3724                         LOCK(&sock->lock);
3725                         have_lock = ISC_TRUE;
3726                 }
3727
3728                 /*
3729                  * Enqueue the request.  If the socket was previously not being
3730                  * watched, poke the watcher to start paying attention to it.
3731                  */
3732                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
3733                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3734                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
3735
3736                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
3737                            "socket_recv: event %p -> task %p",
3738                            dev, ntask);
3739
3740                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3741                         result = ISC_R_INPROGRESS;
3742                 break;
3743
3744         case DOIO_EOF:
3745                 dev->result = ISC_R_EOF;
3746                 /* fallthrough */
3747
3748         case DOIO_HARD:
3749         case DOIO_SUCCESS:
3750                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
3751                         send_recvdone_event(sock, &dev);
3752                 break;
3753         }
3754
3755         if (have_lock)
3756                 UNLOCK(&sock->lock);
3757
3758         return (result);
3759 }
3760
3761 isc_result_t
3762 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3763                  unsigned int minimum, isc_task_t *task,
3764                  isc_taskaction_t action, const void *arg)
3765 {
3766         isc_socketevent_t *dev;
3767         isc_socketmgr_t *manager;
3768         unsigned int iocount;
3769         isc_buffer_t *buffer;
3770
3771         REQUIRE(VALID_SOCKET(sock));
3772         REQUIRE(buflist != NULL);
3773         REQUIRE(!ISC_LIST_EMPTY(*buflist));
3774         REQUIRE(task != NULL);
3775         REQUIRE(action != NULL);
3776
3777         manager = sock->manager;
3778         REQUIRE(VALID_MANAGER(manager));
3779
3780         iocount = isc_bufferlist_availablecount(buflist);
3781         REQUIRE(iocount > 0);
3782
3783         INSIST(sock->bound);
3784
3785         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
3786         if (dev == NULL) {
3787                 return (ISC_R_NOMEMORY);
3788         }
3789
3790         /*
3791          * UDP sockets are always partial read
3792          */
3793         if (sock->type == isc_sockettype_udp)
3794                 dev->minimum = 1;
3795         else {
3796                 if (minimum == 0)
3797                         dev->minimum = iocount;
3798                 else
3799                         dev->minimum = minimum;
3800         }
3801
3802         /*
3803          * Move each buffer from the passed in list to our internal one.
3804          */
3805         buffer = ISC_LIST_HEAD(*buflist);
3806         while (buffer != NULL) {
3807                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
3808                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3809                 buffer = ISC_LIST_HEAD(*buflist);
3810         }
3811
3812         return (socket_recv(sock, dev, task, 0));
3813 }
3814
3815 isc_result_t
3816 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3817                 isc_task_t *task, isc_taskaction_t action, const void *arg)
3818 {
3819         isc_socketevent_t *dev;
3820         isc_socketmgr_t *manager;
3821
3822         REQUIRE(VALID_SOCKET(sock));
3823         REQUIRE(action != NULL);
3824
3825         manager = sock->manager;
3826         REQUIRE(VALID_MANAGER(manager));
3827
3828         INSIST(sock->bound);
3829
3830         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
3831         if (dev == NULL)
3832                 return (ISC_R_NOMEMORY);
3833
3834         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
3835 }
3836
3837 isc_result_t
3838 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
3839                  unsigned int minimum, isc_task_t *task,
3840                  isc_socketevent_t *event, unsigned int flags)
3841 {
3842         event->ev_sender = sock;
3843         event->result = ISC_R_UNEXPECTED;
3844         ISC_LIST_INIT(event->bufferlist);
3845         event->region = *region;
3846         event->n = 0;
3847         event->offset = 0;
3848         event->attributes = 0;
3849
3850         /*
3851          * UDP sockets are always partial read.
3852          */
3853         if (sock->type == isc_sockettype_udp)
3854                 event->minimum = 1;
3855         else {
3856                 if (minimum == 0)
3857                         event->minimum = region->length;
3858                 else
3859                         event->minimum = minimum;
3860         }
3861
3862         return (socket_recv(sock, event, task, flags));
3863 }
3864
3865 static isc_result_t
3866 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
3867             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3868             unsigned int flags)
3869 {
3870         int io_state;
3871         isc_boolean_t have_lock = ISC_FALSE;
3872         isc_task_t *ntask = NULL;
3873         isc_result_t result = ISC_R_SUCCESS;
3874
3875         dev->ev_sender = task;
3876
3877         set_dev_address(address, sock, dev);
3878         if (pktinfo != NULL) {
3879                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
3880                 dev->pktinfo = *pktinfo;
3881
3882                 if (!isc_sockaddr_issitelocal(&dev->address) &&
3883                     !isc_sockaddr_islinklocal(&dev->address)) {
3884                         socket_log(sock, NULL, TRACE, isc_msgcat,
3885                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
3886                                    "pktinfo structure provided, ifindex %u "
3887                                    "(set to 0)", pktinfo->ipi6_ifindex);
3888
3889                         /*
3890                          * Set the pktinfo index to 0 here, to let the
3891                          * kernel decide what interface it should send on.
3892                          */
3893                         dev->pktinfo.ipi6_ifindex = 0;
3894                 }
3895         }
3896
3897         if (sock->type == isc_sockettype_udp)
3898                 io_state = doio_send(sock, dev);
3899         else {
3900                 LOCK(&sock->lock);
3901                 have_lock = ISC_TRUE;
3902
3903                 if (ISC_LIST_EMPTY(sock->send_list))
3904                         io_state = doio_send(sock, dev);
3905                 else
3906                         io_state = DOIO_SOFT;
3907         }
3908
3909         switch (io_state) {
3910         case DOIO_SOFT:
3911                 /*
3912                  * We couldn't send all or part of the request right now, so
3913                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
3914                  */
3915                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
3916                         isc_task_attach(task, &ntask);
3917                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
3918
3919                         if (!have_lock) {
3920                                 LOCK(&sock->lock);
3921                                 have_lock = ISC_TRUE;
3922                         }
3923
3924                         /*
3925                          * Enqueue the request.  If the socket was previously
3926                          * not being watched, poke the watcher to start
3927                          * paying attention to it.
3928                          */
3929                         if (ISC_LIST_EMPTY(sock->send_list) &&
3930                             !sock->pending_send)
3931                                 select_poke(sock->manager, sock->fd,
3932                                             SELECT_POKE_WRITE);
3933                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3934
3935                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
3936                                    "socket_send: event %p -> task %p",
3937                                    dev, ntask);
3938
3939                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3940                                 result = ISC_R_INPROGRESS;
3941                         break;
3942                 }
3943
3944         case DOIO_HARD:
3945         case DOIO_SUCCESS:
3946                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
3947                         send_senddone_event(sock, &dev);
3948                 break;
3949         }
3950
3951         if (have_lock)
3952                 UNLOCK(&sock->lock);
3953
3954         return (result);
3955 }
3956
3957 isc_result_t
3958 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
3959                 isc_task_t *task, isc_taskaction_t action, const void *arg)
3960 {
3961         /*
3962          * REQUIRE() checking is performed in isc_socket_sendto().
3963          */
3964         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3965                                   NULL));
3966 }
3967
3968 isc_result_t
3969 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
3970                   isc_task_t *task, isc_taskaction_t action, const void *arg,
3971                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3972 {
3973         isc_socketevent_t *dev;
3974         isc_socketmgr_t *manager;
3975
3976         REQUIRE(VALID_SOCKET(sock));
3977         REQUIRE(region != NULL);
3978         REQUIRE(task != NULL);
3979         REQUIRE(action != NULL);
3980
3981         manager = sock->manager;
3982         REQUIRE(VALID_MANAGER(manager));
3983
3984         INSIST(sock->bound);
3985
3986         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3987         if (dev == NULL) {
3988                 return (ISC_R_NOMEMORY);
3989         }
3990
3991         dev->region = *region;
3992
3993         return (socket_send(sock, dev, task, address, pktinfo, 0));
3994 }
3995
3996 isc_result_t
3997 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3998                  isc_task_t *task, isc_taskaction_t action, const void *arg)
3999 {
4000         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4001                                    NULL));
4002 }
4003
4004 isc_result_t
4005 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4006                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4007                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4008 {
4009         isc_socketevent_t *dev;
4010         isc_socketmgr_t *manager;
4011         unsigned int iocount;
4012         isc_buffer_t *buffer;
4013
4014         REQUIRE(VALID_SOCKET(sock));
4015         REQUIRE(buflist != NULL);
4016         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4017         REQUIRE(task != NULL);
4018         REQUIRE(action != NULL);
4019
4020         manager = sock->manager;
4021         REQUIRE(VALID_MANAGER(manager));
4022
4023         iocount = isc_bufferlist_usedcount(buflist);
4024         REQUIRE(iocount > 0);
4025
4026         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4027         if (dev == NULL) {
4028                 return (ISC_R_NOMEMORY);
4029         }
4030
4031         /*
4032          * Move each buffer from the passed in list to our internal one.
4033          */
4034         buffer = ISC_LIST_HEAD(*buflist);
4035         while (buffer != NULL) {
4036                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4037                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4038                 buffer = ISC_LIST_HEAD(*buflist);
4039         }
4040
4041         return (socket_send(sock, dev, task, address, pktinfo, 0));
4042 }
4043
4044 isc_result_t
4045 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4046                    isc_task_t *task,
4047                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4048                    isc_socketevent_t *event, unsigned int flags)
4049 {
4050         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4051         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4052                 REQUIRE(sock->type == isc_sockettype_udp);
4053         event->ev_sender = sock;
4054         event->result = ISC_R_UNEXPECTED;
4055         ISC_LIST_INIT(event->bufferlist);
4056         event->region = *region;
4057         event->n = 0;
4058         event->offset = 0;
4059         event->attributes = 0;
4060
4061         return (socket_send(sock, event, task, address, pktinfo, flags));
4062 }
4063
4064 void
4065 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4066 #ifdef ISC_PLATFORM_HAVESYSUNH
4067         int s;
4068         struct stat sb;
4069         char strbuf[ISC_STRERRORSIZE];
4070
4071         if (sockaddr->type.sa.sa_family != AF_UNIX)
4072                 return;
4073
4074 #ifndef S_ISSOCK
4075 #if defined(S_IFMT) && defined(S_IFSOCK)
4076 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4077 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4078 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4079 #endif
4080 #endif
4081
4082 #ifndef S_ISFIFO
4083 #if defined(S_IFMT) && defined(S_IFIFO)
4084 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4085 #elif defined(_S_IFMT) && defined(S_IFIFO)
4086 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4087 #endif
4088 #endif
4089
4090 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4091 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4092 #endif
4093
4094 #ifndef S_ISFIFO
4095 #define S_ISFIFO(mode) 0
4096 #endif
4097
4098 #ifndef S_ISSOCK
4099 #define S_ISSOCK(mode) 0
4100 #endif
4101
4102         if (active) {
4103                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4104                         isc__strerror(errno, strbuf, sizeof(strbuf));
4105                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4106                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4107                                       "isc_socket_cleanunix: stat(%s): %s",
4108                                       sockaddr->type.sunix.sun_path, strbuf);
4109                         return;
4110                 }
4111                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4112                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4113                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4114                                       "isc_socket_cleanunix: %s: not a socket",
4115                                       sockaddr->type.sunix.sun_path);
4116                         return;
4117                 }
4118                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4119                         isc__strerror(errno, strbuf, sizeof(strbuf));
4120                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4121                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4122                                       "isc_socket_cleanunix: unlink(%s): %s",
4123                                       sockaddr->type.sunix.sun_path, strbuf);
4124                 }
4125                 return;
4126         }
4127
4128         s = socket(AF_UNIX, SOCK_STREAM, 0);
4129         if (s < 0) {
4130                 isc__strerror(errno, strbuf, sizeof(strbuf));
4131                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4132                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4133                               "isc_socket_cleanunix: socket(%s): %s",
4134                               sockaddr->type.sunix.sun_path, strbuf);
4135                 return;
4136         }
4137
4138         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4139                 switch (errno) {
4140                 case ENOENT:    /* We exited cleanly last time */
4141                         break;
4142                 default:
4143                         isc__strerror(errno, strbuf, sizeof(strbuf));
4144                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4145                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4146                                       "isc_socket_cleanunix: stat(%s): %s",
4147                                       sockaddr->type.sunix.sun_path, strbuf);
4148                         break;
4149                 }
4150                 goto cleanup;
4151         }
4152
4153         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4154                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4155                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4156                               "isc_socket_cleanunix: %s: not a socket",
4157                               sockaddr->type.sunix.sun_path);
4158                 goto cleanup;
4159         }
4160
4161         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4162                     sizeof(sockaddr->type.sunix)) < 0) {
4163                 switch (errno) {
4164                 case ECONNREFUSED:
4165                 case ECONNRESET:
4166                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4167                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4168                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4169                                               ISC_LOGMODULE_SOCKET,
4170                                               ISC_LOG_WARNING,
4171                                               "isc_socket_cleanunix: "
4172                                               "unlink(%s): %s",
4173                                               sockaddr->type.sunix.sun_path,
4174                                               strbuf);
4175                         }
4176                         break;
4177                 default:
4178                         isc__strerror(errno, strbuf, sizeof(strbuf));
4179                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4180                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4181                                       "isc_socket_cleanunix: connect(%s): %s",
4182                                       sockaddr->type.sunix.sun_path, strbuf);
4183                         break;
4184                 }
4185         }
4186  cleanup:
4187         close(s);
4188 #else
4189         UNUSED(sockaddr);
4190         UNUSED(active);
4191 #endif
4192 }
4193
4194 isc_result_t
4195 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4196                     isc_uint32_t owner, isc_uint32_t group)
4197 {
4198 #ifdef ISC_PLATFORM_HAVESYSUNH
4199         isc_result_t result = ISC_R_SUCCESS;
4200         char strbuf[ISC_STRERRORSIZE];
4201         char path[sizeof(sockaddr->type.sunix.sun_path)];
4202 #ifdef NEED_SECURE_DIRECTORY
4203         char *slash;
4204 #endif
4205
4206         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4207         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4208         strcpy(path, sockaddr->type.sunix.sun_path);
4209
4210 #ifdef NEED_SECURE_DIRECTORY
4211         slash = strrchr(path, '/');
4212         if (slash != NULL) {
4213                 if (slash != path)
4214                         *slash = '\0';
4215                 else
4216                         strcpy(path, "/");
4217         } else
4218                 strcpy(path, ".");
4219 #endif
4220
4221         if (chmod(path, perm) < 0) {
4222                 isc__strerror(errno, strbuf, sizeof(strbuf));
4223                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4224                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4225                               "isc_socket_permunix: chmod(%s, %d): %s",
4226                               path, perm, strbuf);
4227                 result = ISC_R_FAILURE;
4228         }
4229         if (chown(path, owner, group) < 0) {
4230                 isc__strerror(errno, strbuf, sizeof(strbuf));
4231                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4232                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4233                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4234                               path, owner, group,
4235                               strbuf);
4236                 result = ISC_R_FAILURE;
4237         }
4238         return (result);
4239 #else
4240         UNUSED(sockaddr);
4241         UNUSED(perm);
4242         UNUSED(owner);
4243         UNUSED(group);
4244         return (ISC_R_NOTIMPLEMENTED);
4245 #endif
4246 }
4247
4248 isc_result_t
4249 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4250                 unsigned int options) {
4251         char strbuf[ISC_STRERRORSIZE];
4252         int on = 1;
4253
4254         LOCK(&sock->lock);
4255
4256         INSIST(!sock->bound);
4257
4258         if (sock->pf != sockaddr->type.sa.sa_family) {
4259                 UNLOCK(&sock->lock);
4260                 return (ISC_R_FAMILYMISMATCH);
4261         }
4262         /*
4263          * Only set SO_REUSEADDR when we want a specific port.
4264          */
4265 #ifdef AF_UNIX
4266         if (sock->pf == AF_UNIX)
4267                 goto bind_socket;
4268 #endif
4269         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4270             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4271             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4272                        sizeof(on)) < 0) {
4273                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4274                                  "setsockopt(%d) %s", sock->fd,
4275                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4276                                                 ISC_MSG_FAILED, "failed"));
4277                 /* Press on... */
4278         }
4279 #ifdef AF_UNIX
4280  bind_socket:
4281 #endif
4282         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4283                 UNLOCK(&sock->lock);
4284                 switch (errno) {
4285                 case EACCES:
4286                         return (ISC_R_NOPERM);
4287                 case EADDRNOTAVAIL:
4288                         return (ISC_R_ADDRNOTAVAIL);
4289                 case EADDRINUSE:
4290                         return (ISC_R_ADDRINUSE);
4291                 case EINVAL:
4292                         return (ISC_R_BOUND);
4293                 default:
4294                         isc__strerror(errno, strbuf, sizeof(strbuf));
4295                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4296                                          strbuf);
4297                         return (ISC_R_UNEXPECTED);
4298                 }
4299         }
4300
4301         socket_log(sock, sockaddr, TRACE,
4302                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4303         sock->bound = 1;
4304
4305         UNLOCK(&sock->lock);
4306         return (ISC_R_SUCCESS);
4307 }
4308
4309 isc_result_t
4310 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4311 #ifdef SO_ACCEPTFILTER
4312         char strbuf[ISC_STRERRORSIZE];
4313         struct accept_filter_arg afa;
4314 #else
4315         UNUSED(sock);
4316         UNUSED(filter);
4317 #endif
4318
4319         REQUIRE(VALID_SOCKET(sock));
4320
4321 #ifdef SO_ACCEPTFILTER
4322         bzero(&afa, sizeof(afa));
4323         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4324         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4325                          &afa, sizeof(afa)) == -1) {
4326                 isc__strerror(errno, strbuf, sizeof(strbuf));
4327                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4328                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4329                            strbuf);
4330                 return (ISC_R_FAILURE);
4331         }
4332         return (ISC_R_SUCCESS);
4333 #else
4334         return (ISC_R_NOTIMPLEMENTED);
4335 #endif
4336 }
4337
4338 /*
4339  * Set up to listen on a given socket.  We do this by creating an internal
4340  * event that will be dispatched when the socket has read activity.  The
4341  * watcher will send the internal event to the task when there is a new
4342  * connection.
4343  *
4344  * Unlike in read, we don't preallocate a done event here.  Every time there
4345  * is a new connection we'll have to allocate a new one anyway, so we might
4346  * as well keep things simple rather than having to track them.
4347  */
4348 isc_result_t
4349 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4350         char strbuf[ISC_STRERRORSIZE];
4351
4352         REQUIRE(VALID_SOCKET(sock));
4353
4354         LOCK(&sock->lock);
4355
4356         REQUIRE(!sock->listener);
4357         REQUIRE(sock->bound);
4358         REQUIRE(sock->type == isc_sockettype_tcp ||
4359                 sock->type == isc_sockettype_unix);
4360
4361         if (backlog == 0)
4362                 backlog = SOMAXCONN;
4363
4364         if (listen(sock->fd, (int)backlog) < 0) {
4365                 UNLOCK(&sock->lock);
4366                 isc__strerror(errno, strbuf, sizeof(strbuf));
4367
4368                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4369
4370                 return (ISC_R_UNEXPECTED);
4371         }
4372
4373         sock->listener = 1;
4374
4375         UNLOCK(&sock->lock);
4376         return (ISC_R_SUCCESS);
4377 }
4378
4379 /*
4380  * This should try to do aggressive accept() XXXMLG
4381  */
4382 isc_result_t
4383 isc_socket_accept(isc_socket_t *sock,
4384                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4385 {
4386         isc_socket_newconnev_t *dev;
4387         isc_socketmgr_t *manager;
4388         isc_task_t *ntask = NULL;
4389         isc_socket_t *nsock;
4390         isc_result_t result;
4391         isc_boolean_t do_poke = ISC_FALSE;
4392
4393         REQUIRE(VALID_SOCKET(sock));
4394         manager = sock->manager;
4395         REQUIRE(VALID_MANAGER(manager));
4396
4397         LOCK(&sock->lock);
4398
4399         REQUIRE(sock->listener);
4400
4401         /*
4402          * Sender field is overloaded here with the task we will be sending
4403          * this event to.  Just before the actual event is delivered the
4404          * actual ev_sender will be touched up to be the socket.
4405          */
4406         dev = (isc_socket_newconnev_t *)
4407                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4408                                    action, arg, sizeof(*dev));
4409         if (dev == NULL) {
4410                 UNLOCK(&sock->lock);
4411                 return (ISC_R_NOMEMORY);
4412         }
4413         ISC_LINK_INIT(dev, ev_link);
4414
4415         result = allocate_socket(manager, sock->type, &nsock);
4416         if (result != ISC_R_SUCCESS) {
4417                 isc_event_free(ISC_EVENT_PTR(&dev));
4418                 UNLOCK(&sock->lock);
4419                 return (result);
4420         }
4421
4422         /*
4423          * Attach to socket and to task.
4424          */
4425         isc_task_attach(task, &ntask);
4426         nsock->references++;
4427
4428         dev->ev_sender = ntask;
4429         dev->newsocket = nsock;
4430
4431         /*
4432          * Poke watcher here.  We still have the socket locked, so there
4433          * is no race condition.  We will keep the lock for such a short
4434          * bit of time waking it up now or later won't matter all that much.
4435          */
4436         if (ISC_LIST_EMPTY(sock->accept_list))
4437                 do_poke = ISC_TRUE;
4438
4439         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4440
4441         if (do_poke)
4442                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4443
4444         UNLOCK(&sock->lock);
4445         return (ISC_R_SUCCESS);
4446 }
4447
4448 isc_result_t
4449 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4450                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4451 {
4452         isc_socket_connev_t *dev;
4453         isc_task_t *ntask = NULL;
4454         isc_socketmgr_t *manager;
4455         int cc;
4456         char strbuf[ISC_STRERRORSIZE];
4457
4458         REQUIRE(VALID_SOCKET(sock));
4459         REQUIRE(addr != NULL);
4460         REQUIRE(task != NULL);
4461         REQUIRE(action != NULL);
4462
4463         manager = sock->manager;
4464         REQUIRE(VALID_MANAGER(manager));
4465         REQUIRE(addr != NULL);
4466
4467         if (isc_sockaddr_ismulticast(addr))
4468                 return (ISC_R_MULTICAST);
4469
4470         LOCK(&sock->lock);
4471
4472         REQUIRE(!sock->connecting);
4473
4474         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4475                                                         ISC_SOCKEVENT_CONNECT,
4476                                                         action, arg,
4477                                                         sizeof(*dev));
4478         if (dev == NULL) {
4479                 UNLOCK(&sock->lock);
4480                 return (ISC_R_NOMEMORY);
4481         }
4482         ISC_LINK_INIT(dev, ev_link);
4483
4484         /*
4485          * Try to do the connect right away, as there can be only one
4486          * outstanding, and it might happen to complete.
4487          */
4488         sock->address = *addr;
4489         cc = connect(sock->fd, &addr->type.sa, addr->length);
4490         if (cc < 0) {
4491                 /*
4492                  * HP-UX "fails" to connect a UDP socket and sets errno to
4493                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4494                  * a success and let the user detect it if it's really an error
4495                  * at the time of sending a packet on the socket.
4496                  */
4497                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4498                         cc = 0;
4499                         goto success;
4500                 }
4501                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4502                         goto queue;
4503
4504                 switch (errno) {
4505 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4506                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4507                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4508                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4509                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4510                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4511 #ifdef EHOSTDOWN
4512                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4513 #endif
4514                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4515                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4516                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4517                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4518                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4519 #undef ERROR_MATCH
4520                 }
4521
4522                 sock->connected = 0;
4523
4524                 isc__strerror(errno, strbuf, sizeof(strbuf));
4525                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
4526
4527                 UNLOCK(&sock->lock);
4528                 isc_event_free(ISC_EVENT_PTR(&dev));
4529                 return (ISC_R_UNEXPECTED);
4530
4531         err_exit:
4532                 sock->connected = 0;
4533                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4534
4535                 UNLOCK(&sock->lock);
4536                 return (ISC_R_SUCCESS);
4537         }
4538
4539         /*
4540          * If connect completed, fire off the done event.
4541          */
4542  success:
4543         if (cc == 0) {
4544                 sock->connected = 1;
4545                 sock->bound = 1;
4546                 dev->result = ISC_R_SUCCESS;
4547                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4548
4549                 UNLOCK(&sock->lock);
4550                 return (ISC_R_SUCCESS);
4551         }
4552
4553  queue:
4554
4555         /*
4556          * Attach to task.
4557          */
4558         isc_task_attach(task, &ntask);
4559
4560         sock->connecting = 1;
4561
4562         dev->ev_sender = ntask;
4563
4564         /*
4565          * Poke watcher here.  We still have the socket locked, so there
4566          * is no race condition.  We will keep the lock for such a short
4567          * bit of time waking it up now or later won't matter all that much.
4568          */
4569         if (sock->connect_ev == NULL)
4570                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
4571
4572         sock->connect_ev = dev;
4573
4574         UNLOCK(&sock->lock);
4575         return (ISC_R_SUCCESS);
4576 }
4577
4578 /*
4579  * Called when a socket with a pending connect() finishes.
4580  */
4581 static void
4582 internal_connect(isc_task_t *me, isc_event_t *ev) {
4583         isc_socket_t *sock;
4584         isc_socket_connev_t *dev;
4585         isc_task_t *task;
4586         int cc;
4587         ISC_SOCKADDR_LEN_T optlen;
4588         char strbuf[ISC_STRERRORSIZE];
4589         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4590
4591         UNUSED(me);
4592         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
4593
4594         sock = ev->ev_sender;
4595         INSIST(VALID_SOCKET(sock));
4596
4597         LOCK(&sock->lock);
4598
4599         /*
4600          * When the internal event was sent the reference count was bumped
4601          * to keep the socket around for us.  Decrement the count here.
4602          */
4603         INSIST(sock->references > 0);
4604         sock->references--;
4605         if (sock->references == 0) {
4606                 UNLOCK(&sock->lock);
4607                 destroy(&sock);
4608                 return;
4609         }
4610
4611         /*
4612          * Has this event been canceled?
4613          */
4614         dev = sock->connect_ev;
4615         if (dev == NULL) {
4616                 INSIST(!sock->connecting);
4617                 UNLOCK(&sock->lock);
4618                 return;
4619         }
4620
4621         INSIST(sock->connecting);
4622         sock->connecting = 0;
4623
4624         /*
4625          * Get any possible error status here.
4626          */
4627         optlen = sizeof(cc);
4628         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
4629                        (void *)&cc, (void *)&optlen) < 0)
4630                 cc = errno;
4631         else
4632                 errno = cc;
4633
4634         if (errno != 0) {
4635                 /*
4636                  * If the error is EAGAIN, just re-select on this
4637                  * fd and pretend nothing strange happened.
4638                  */
4639                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4640                         sock->connecting = 1;
4641                         select_poke(sock->manager, sock->fd,
4642                                     SELECT_POKE_CONNECT);
4643                         UNLOCK(&sock->lock);
4644
4645                         return;
4646                 }
4647
4648                 /*
4649                  * Translate other errors into ISC_R_* flavors.
4650                  */
4651                 switch (errno) {
4652 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
4653                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4654                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4655                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4656                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4657                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4658 #ifdef EHOSTDOWN
4659                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4660 #endif
4661                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4662                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4663                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4664                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4665                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4666                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4667 #undef ERROR_MATCH
4668                 default:
4669                         dev->result = ISC_R_UNEXPECTED;
4670                         isc_sockaddr_format(&sock->address, peerbuf,
4671                                             sizeof(peerbuf));
4672                         isc__strerror(errno, strbuf, sizeof(strbuf));
4673                         UNEXPECTED_ERROR(__FILE__, __LINE__,
4674                                          "internal_connect: connect(%s) %s",
4675                                          peerbuf, strbuf);
4676                 }
4677         } else {
4678                 dev->result = ISC_R_SUCCESS;
4679                 sock->connected = 1;
4680                 sock->bound = 1;
4681         }
4682
4683         sock->connect_ev = NULL;
4684
4685         UNLOCK(&sock->lock);
4686
4687         task = dev->ev_sender;
4688         dev->ev_sender = sock;
4689         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
4690 }
4691
4692 isc_result_t
4693 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4694         isc_result_t result;
4695
4696         REQUIRE(VALID_SOCKET(sock));
4697         REQUIRE(addressp != NULL);
4698
4699         LOCK(&sock->lock);
4700
4701         if (sock->connected) {
4702                 *addressp = sock->address;
4703                 result = ISC_R_SUCCESS;
4704         } else {
4705                 result = ISC_R_NOTCONNECTED;
4706         }
4707
4708         UNLOCK(&sock->lock);
4709
4710         return (result);
4711 }
4712
4713 isc_result_t
4714 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4715         ISC_SOCKADDR_LEN_T len;
4716         isc_result_t result;
4717         char strbuf[ISC_STRERRORSIZE];
4718
4719         REQUIRE(VALID_SOCKET(sock));
4720         REQUIRE(addressp != NULL);
4721
4722         LOCK(&sock->lock);
4723
4724         if (!sock->bound) {
4725                 result = ISC_R_NOTBOUND;
4726                 goto out;
4727         }
4728
4729         result = ISC_R_SUCCESS;
4730
4731         len = sizeof(addressp->type);
4732         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
4733                 isc__strerror(errno, strbuf, sizeof(strbuf));
4734                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
4735                                  strbuf);
4736                 result = ISC_R_UNEXPECTED;
4737                 goto out;
4738         }
4739         addressp->length = (unsigned int)len;
4740
4741  out:
4742         UNLOCK(&sock->lock);
4743
4744         return (result);
4745 }
4746
4747 /*
4748  * Run through the list of events on this socket, and cancel the ones
4749  * queued for task "task" of type "how".  "how" is a bitmask.
4750  */
4751 void
4752 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
4753
4754         REQUIRE(VALID_SOCKET(sock));
4755
4756         /*
4757          * Quick exit if there is nothing to do.  Don't even bother locking
4758          * in this case.
4759          */
4760         if (how == 0)
4761                 return;
4762
4763         LOCK(&sock->lock);
4764
4765         /*
4766          * All of these do the same thing, more or less.
4767          * Each will:
4768          *      o If the internal event is marked as "posted" try to
4769          *        remove it from the task's queue.  If this fails, mark it
4770          *        as canceled instead, and let the task clean it up later.
4771          *      o For each I/O request for that task of that type, post
4772          *        its done event with status of "ISC_R_CANCELED".
4773          *      o Reset any state needed.
4774          */
4775         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
4776             && !ISC_LIST_EMPTY(sock->recv_list)) {
4777                 isc_socketevent_t      *dev;
4778                 isc_socketevent_t      *next;
4779                 isc_task_t             *current_task;
4780
4781                 dev = ISC_LIST_HEAD(sock->recv_list);
4782
4783                 while (dev != NULL) {
4784                         current_task = dev->ev_sender;
4785                         next = ISC_LIST_NEXT(dev, ev_link);
4786
4787                         if ((task == NULL) || (task == current_task)) {
4788                                 dev->result = ISC_R_CANCELED;
4789                                 send_recvdone_event(sock, &dev);
4790                         }
4791                         dev = next;
4792                 }
4793         }
4794
4795         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
4796             && !ISC_LIST_EMPTY(sock->send_list)) {
4797                 isc_socketevent_t      *dev;
4798                 isc_socketevent_t      *next;
4799                 isc_task_t             *current_task;
4800
4801                 dev = ISC_LIST_HEAD(sock->send_list);
4802
4803                 while (dev != NULL) {
4804                         current_task = dev->ev_sender;
4805                         next = ISC_LIST_NEXT(dev, ev_link);
4806
4807                         if ((task == NULL) || (task == current_task)) {
4808                                 dev->result = ISC_R_CANCELED;
4809                                 send_senddone_event(sock, &dev);
4810                         }
4811                         dev = next;
4812                 }
4813         }
4814
4815         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
4816             && !ISC_LIST_EMPTY(sock->accept_list)) {
4817                 isc_socket_newconnev_t *dev;
4818                 isc_socket_newconnev_t *next;
4819                 isc_task_t             *current_task;
4820
4821                 dev = ISC_LIST_HEAD(sock->accept_list);
4822                 while (dev != NULL) {
4823                         current_task = dev->ev_sender;
4824                         next = ISC_LIST_NEXT(dev, ev_link);
4825
4826                         if ((task == NULL) || (task == current_task)) {
4827
4828                                 ISC_LIST_UNLINK(sock->accept_list, dev,
4829                                                 ev_link);
4830
4831                                 dev->newsocket->references--;
4832                                 free_socket(&dev->newsocket);
4833
4834                                 dev->result = ISC_R_CANCELED;
4835                                 dev->ev_sender = sock;
4836                                 isc_task_sendanddetach(&current_task,
4837                                                        ISC_EVENT_PTR(&dev));
4838                         }
4839
4840                         dev = next;
4841                 }
4842         }
4843
4844         /*
4845          * Connecting is not a list.
4846          */
4847         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
4848             && sock->connect_ev != NULL) {
4849                 isc_socket_connev_t    *dev;
4850                 isc_task_t             *current_task;
4851
4852                 INSIST(sock->connecting);
4853                 sock->connecting = 0;
4854
4855                 dev = sock->connect_ev;
4856                 current_task = dev->ev_sender;
4857
4858                 if ((task == NULL) || (task == current_task)) {
4859                         sock->connect_ev = NULL;
4860
4861                         dev->result = ISC_R_CANCELED;
4862                         dev->ev_sender = sock;
4863                         isc_task_sendanddetach(&current_task,
4864                                                ISC_EVENT_PTR(&dev));
4865                 }
4866         }
4867
4868         UNLOCK(&sock->lock);
4869 }
4870
4871 isc_sockettype_t
4872 isc_socket_gettype(isc_socket_t *sock) {
4873         REQUIRE(VALID_SOCKET(sock));
4874
4875         return (sock->type);
4876 }
4877
4878 isc_boolean_t
4879 isc_socket_isbound(isc_socket_t *sock) {
4880         isc_boolean_t val;
4881
4882         LOCK(&sock->lock);
4883         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
4884         UNLOCK(&sock->lock);
4885
4886         return (val);
4887 }
4888
4889 void
4890 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
4891 #if defined(IPV6_V6ONLY)
4892         int onoff = yes ? 1 : 0;
4893 #else
4894         UNUSED(yes);
4895         UNUSED(sock);
4896 #endif
4897
4898         REQUIRE(VALID_SOCKET(sock));
4899
4900 #ifdef IPV6_V6ONLY
4901         if (sock->pf == AF_INET6) {
4902                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
4903                                (void *)&onoff, sizeof(int)) < 0) {
4904                         char strbuf[ISC_STRERRORSIZE];
4905         
4906                         UNEXPECTED_ERROR(__FILE__, __LINE__,
4907                                          "setsockopt(%d, IPV6_V6ONLY) "
4908                                          "%s: %s", sock->fd,
4909                                          isc_msgcat_get(isc_msgcat,
4910                                                         ISC_MSGSET_GENERAL,
4911                                                         ISC_MSG_FAILED,
4912                                                         "failed"),
4913                                          strbuf);
4914                 }
4915         }
4916         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
4917 #endif
4918 }
4919
4920 #ifndef ISC_PLATFORM_USETHREADS
4921 /* In our assumed scenario, we can simply use a single static object. */
4922 static isc_socketwait_t swait_private;
4923
4924 int
4925 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
4926         int n;
4927 #ifdef USE_KQUEUE
4928         struct timespec ts, *tsp;
4929 #endif
4930 #ifdef USE_EPOLL
4931         int timeout;
4932 #endif
4933 #ifdef USE_DEVPOLL
4934         struct dvpoll dvp;
4935 #endif
4936
4937         REQUIRE(swaitp != NULL && *swaitp == NULL);
4938
4939         if (socketmgr == NULL)
4940                 return (0);
4941
4942 #ifdef USE_KQUEUE
4943         if (tvp != NULL) {
4944                 ts.tv_sec = tvp->tv_sec;
4945                 ts.tv_nsec = tvp->tv_usec * 1000;
4946                 tsp = &ts;
4947         } else
4948                 tsp = NULL;
4949         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
4950                                        socketmgr->events, socketmgr->nevents,
4951                                        tsp);
4952         n = swait_private.nevents;
4953 #elif defined(USE_EPOLL)
4954         if (tvp != NULL)
4955                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
4956         else
4957                 timeout = -1;
4958         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
4959                                            socketmgr->events,
4960                                            socketmgr->nevents, timeout);
4961         n = swait_private.nevents;
4962 #elif defined(USE_DEVPOLL)
4963         dvp.dp_fds = socketmgr->events;
4964         dvp.dp_nfds = socketmgr->nevents;
4965         if (tvp != NULL) {
4966                 dvp.dp_timeout = tvp->tv_sec * 1000 +
4967                         (tvp->tv_usec + 999) / 1000;
4968         } else
4969                 dvp.dp_timeout = -1;
4970         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
4971         n = swait_private.nevents;
4972 #elif defined(USE_SELECT)
4973         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
4974                socketmgr->fd_bufsize);
4975         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
4976                socketmgr->fd_bufsize);
4977
4978         swait_private.readset = socketmgr->read_fds_copy;
4979         swait_private.writeset = socketmgr->write_fds_copy;
4980         swait_private.maxfd = socketmgr->maxfd + 1;
4981
4982         n = select(swait_private.maxfd, swait_private.readset,
4983                    swait_private.writeset, NULL, tvp);
4984 #endif
4985
4986         *swaitp = &swait_private;
4987         return (n);
4988 }
4989
4990 isc_result_t
4991 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
4992         REQUIRE(swait == &swait_private);
4993
4994         if (socketmgr == NULL)
4995                 return (ISC_R_NOTFOUND);
4996
4997 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
4998         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
4999         return (ISC_R_SUCCESS);
5000 #elif defined(USE_SELECT)
5001         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5002         return (ISC_R_SUCCESS);
5003 #endif
5004 }
5005 #endif /* ISC_PLATFORM_USETHREADS */