]> CyberLeo.Net >> Repos - FreeBSD/releng/7.2.git/blob - contrib/bind9/lib/isc/unix/socket.c
Create releng/7.2 from stable/7 in preparation for 7.2-RELEASE.
[FreeBSD/releng/7.2.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2008  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.237.18.56.2.1 2008/12/23 00:14:34 marka Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/strerror.h>
54 #include <isc/task.h>
55 #include <isc/thread.h>
56 #include <isc/util.h>
57
58 #ifdef ISC_PLATFORM_HAVESYSUNH
59 #include <sys/un.h>
60 #endif
61 #ifdef ISC_PLATFORM_HAVEKQUEUE
62 #include <sys/event.h>
63 #endif
64 #ifdef ISC_PLATFORM_HAVEEPOLL
65 #include <sys/epoll.h>
66 #endif
67 #ifdef ISC_PLATFORM_HAVEDEVPOLL
68 #include <sys/devpoll.h>
69 #endif
70
71 #include "errno2result.h"
72
73 #ifndef ISC_PLATFORM_USETHREADS
74 #include "socket_p.h"
75 #endif /* ISC_PLATFORM_USETHREADS */
76
/*%
 * Select the I/O multiplexing backend.  The first platform facility that
 * is available wins, in the order kqueue, epoll, /dev/poll; select() is
 * the fallback when none of them is present.
 */
#if defined(ISC_PLATFORM_HAVEKQUEUE)
#define USE_KQUEUE
#elif defined(ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined(ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/* Per-descriptor record of which directions are currently being polled. */
typedef struct {
        unsigned int want_read : 1;
        unsigned int want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif  /* multiplex method selection */
93
#if !defined(ISC_PLATFORM_USETHREADS)
/*
 * Result of a wait operation in the non-threaded build.  The event-based
 * backends only carry an event count; the select() backend carries the
 * descriptor sets together with the select() result and highest fd.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
        int nevents;
};
#elif defined(USE_SELECT)
struct isc_socketwait {
        fd_set *readset;
        fd_set *writeset;
        int nfds;
        int maxfd;
};
#endif  /* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */
#endif  /* !ISC_PLATFORM_USETHREADS */
108
109 /*%
110  * Maximum number of allowable open sockets.  This is also the maximum
111  * allowable socket file descriptor.
112  *
113  * Care should be taken before modifying this value for select():
114  * The API standard doesn't ensure that select() accepts more than (the system default
115  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
116  * the vast majority of cases.  This constant should therefore be increased only
117  * when absolutely necessary and possible, i.e., the server is exhausting all
118  * available file descriptors (up to FD_SETSIZE) and the select() function
119  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
120  * always be true, but we keep using some of them to ensure as much
121  * portability as possible).  Note also that overall server performance
122  * may be rather worsened with a larger value of this constant due to
123  * inherent scalability problems of select().
124  *
125  * As a special note, this value shouldn't have to be touched if
126  * this is a build for an authoritative only DNS server.
127  */
128 #ifndef ISC_SOCKET_MAXSOCKETS
129 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
130 #define ISC_SOCKET_MAXSOCKETS 4096
131 #elif defined(USE_SELECT)
132 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
133 #endif  /* USE_KQUEUE... */
134 #endif  /* ISC_SOCKET_MAXSOCKETS */
135
136 #ifdef USE_SELECT
137 /*%
138  * Mac OS X needs a special definition to support larger values in select().
139  * We always define this because a larger value can be specified run-time.
140  */
141 #ifdef __APPLE__
142 #define _DARWIN_UNLIMITED_SELECT
143 #endif  /* __APPLE__ */
144 #endif  /* USE_SELECT */
145
146 #ifdef ISC_SOCKET_USE_POLLWATCH
147 /*%
148  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
149  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
150  * some of the specified FD.  The idea is based on the observation that it's
151  * likely for a busy server to keep receiving packets.  It specifically works
152  * as follows: the socket watcher is first initialized with the state of
153  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
154  * event occurs.  When it wakes up for a socket I/O event, it moves to the
155  * poll_active state, and sets the poll timeout to a short period
156  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
157  * watcher goes to the poll_checking state with the same timeout period.
158  * In this state, the watcher tries to detect whether this is a break
159  * during intermittent events or the kernel bug is triggered.  If the next
160  * polling reports an event within the short period, the previous timeout is
161  * likely to be a kernel bug, and so the watcher goes back to the active state.
162  * Otherwise, it moves to the idle state again.
163  *
164  * It's not clear whether this is a thread-related bug, but since we've only
165  * seen this with threads, this workaround is used only when enabling threads.
166  */
167
168 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
169
170 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
171 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
172 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
173 #endif  /* ISC_SOCKET_USE_POLLWATCH */
174
175 /*%
176  * Size of per-FD lock buckets.
177  */
178 #ifdef ISC_PLATFORM_USETHREADS
179 #define FDLOCK_COUNT            1024
180 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
181 #else
182 #define FDLOCK_COUNT            1
183 #define FDLOCK_ID(fd)           0
184 #endif  /* ISC_PLATFORM_USETHREADS */
185
186 /*%
187  * Maximum number of events communicated with the kernel.  There should normally
188  * be no need for having a large number.
189  */
190 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
191 #ifndef ISC_SOCKET_MAXEVENTS
192 #define ISC_SOCKET_MAXEVENTS    64
193 #endif
194 #endif
195
196 /*%
197  * Some systems define the socket length argument as an int, some as size_t,
198  * some as socklen_t.  This is here so it can be easily changed if needed.
199  */
200 #ifndef ISC_SOCKADDR_LEN_T
201 #define ISC_SOCKADDR_LEN_T unsigned int
202 #endif
203
204
205 #if defined(SO_BSDCOMPAT) && defined(__linux__)
206 #include <sys/utsname.h>
207 #endif
208
209 /*%
210  * Define what the possible "soft" errors can be.  These are non-fatal returns
211  * of various network related functions, like recv() and so on.
212  *
213  * For some reason, BSDI (and perhaps others) will sometimes return <0
214  * from recv() but will have errno==0.  This is broken, but we have to
215  * work around it here.
216  */
217 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
218                          (e) == EWOULDBLOCK || \
219                          (e) == EINTR || \
220                          (e) == 0)
221
222 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
223
224 /*!<
225  * DLVL(90)  --  Function entry/exit and other tracing.
226  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
227  * DLVL(60)  --  Socket data send/receive
228  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
229  * DLVL(20)  --  Socket creation/destruction.
230  */
231 #define TRACE_LEVEL             90
232 #define CORRECTNESS_LEVEL       70
233 #define IOEVENT_LEVEL           60
234 #define EVENT_LEVEL             50
235 #define CREATION_LEVEL          20
236
237 #define TRACE           DLVL(TRACE_LEVEL)
238 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
239 #define IOEVENT         DLVL(IOEVENT_LEVEL)
240 #define EVENT           DLVL(EVENT_LEVEL)
241 #define CREATION        DLVL(CREATION_LEVEL)
242
243 typedef isc_event_t intev_t;
244
245 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
246 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
247
248 /*!
249  * IPv6 control information.  If the socket is an IPv6 socket we want
250  * to collect the destination address and interface so the client can
251  * set them on outgoing packets.
252  */
253 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
254 #ifndef USE_CMSG
255 #define USE_CMSG        1
256 #endif
257 #endif
258
259 /*%
260  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
261  * a setsockopt() like interface to request timestamps, and if the OS
262  * doesn't do it for us, call gettimeofday() on every UDP receive?
263  */
264 #ifdef SO_TIMESTAMP
265 #ifndef USE_CMSG
266 #define USE_CMSG        1
267 #endif
268 #endif
269
270 /*%
271  * The size to raise the receive buffer to (from BIND 8).
272  */
273 #define RCVBUFSIZE (32*1024)
274
275 /*%
276  * The number of times a send operation is repeated if the result is EINTR.
277  */
278 #define NRETRIES 10
279
280 struct isc_socket {
281         /* Not locked. */
282         unsigned int            magic;
283         isc_socketmgr_t        *manager;
284         isc_mutex_t             lock;
285         isc_sockettype_t        type;
286
287         /* Locked by socket lock. */
288         ISC_LINK(isc_socket_t)  link;
289         unsigned int            references;
290         int                     fd;
291         int                     pf;
292
293         ISC_LIST(isc_socketevent_t)             send_list;
294         ISC_LIST(isc_socketevent_t)             recv_list;
295         ISC_LIST(isc_socket_newconnev_t)        accept_list;
296         isc_socket_connev_t                    *connect_ev;
297
298         /*
299          * Internal events.  Posted when a descriptor is readable or
300          * writable.  These are statically allocated and never freed.
301          * They will be set to non-purgable before use.
302          */
303         intev_t                 readable_ev;
304         intev_t                 writable_ev;
305
306         isc_sockaddr_t          address;  /* remote address */
307
308         unsigned int            pending_recv : 1,
309                                 pending_send : 1,
310                                 pending_accept : 1,
311                                 listener : 1, /* listener socket */
312                                 connected : 1,
313                                 connecting : 1, /* connect pending */
314                                 bound : 1; /* bound to local addr */
315
316 #ifdef ISC_NET_RECVOVERFLOW
317         unsigned char           overflow; /* used for MSG_TRUNC fake */
318 #endif
319
320         char                    *recvcmsgbuf;
321         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
322         char                    *sendcmsgbuf;
323         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
324 };
325
326 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
327 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
328
329 struct isc_socketmgr {
330         /* Not locked. */
331         unsigned int            magic;
332         isc_mem_t              *mctx;
333         isc_mutex_t             lock;
334         isc_mutex_t             *fdlock;
335 #ifdef USE_KQUEUE
336         int                     kqueue_fd;
337         int                     nevents;
338         struct kevent           *events;
339 #endif  /* USE_KQUEUE */
340 #ifdef USE_EPOLL
341         int                     epoll_fd;
342         int                     nevents;
343         struct epoll_event      *events;
344 #endif  /* USE_EPOLL */
345 #ifdef USE_DEVPOLL
346         int                     devpoll_fd;
347         int                     nevents;
348         struct pollfd           *events;
349 #endif  /* USE_DEVPOLL */
350 #ifdef USE_SELECT
351         int                     fd_bufsize;
352 #endif  /* USE_SELECT */
353         unsigned int            maxsocks;
354 #ifdef ISC_PLATFORM_USETHREADS
355         int                     pipe_fds[2];
356 #endif
357
358         /* Locked by fdlock. */
359         isc_socket_t           **fds;
360         int                     *fdstate;
361 #ifdef USE_DEVPOLL
362         pollinfo_t              *fdpollinfo;
363 #endif
364
365         /* Locked by manager lock. */
366         ISC_LIST(isc_socket_t)  socklist;
367 #ifdef USE_SELECT
368         fd_set                  *read_fds;
369         fd_set                  *read_fds_copy;
370         fd_set                  *write_fds;
371         fd_set                  *write_fds_copy;
372         int                     maxfd;
373 #endif  /* USE_SELECT */
374         int                     reserved;       /* unlocked */
375 #ifdef ISC_PLATFORM_USETHREADS
376         isc_thread_t            watcher;
377         isc_condition_t         shutdown_ok;
378 #else /* ISC_PLATFORM_USETHREADS */
379         unsigned int            refs;
380 #endif /* ISC_PLATFORM_USETHREADS */
381 };
382
383 #ifndef ISC_PLATFORM_USETHREADS
384 static isc_socketmgr_t *socketmgr = NULL;
385 #endif /* ISC_PLATFORM_USETHREADS */
386
387 #define CLOSED          0       /* this one must be zero */
388 #define MANAGED         1
389 #define CLOSE_PENDING   2
390
391 /*
392  * send() and recv() iovec counts
393  */
394 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
395 #ifdef ISC_NET_RECVOVERFLOW
396 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
397 #else
398 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
399 #endif
400
401 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
402 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
403 static void free_socket(isc_socket_t **);
404 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
405                                     isc_socket_t **);
406 static void destroy(isc_socket_t **);
407 static void internal_accept(isc_task_t *, isc_event_t *);
408 static void internal_connect(isc_task_t *, isc_event_t *);
409 static void internal_recv(isc_task_t *, isc_event_t *);
410 static void internal_send(isc_task_t *, isc_event_t *);
411 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
412 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
413                               struct msghdr *, struct iovec *, size_t *);
414 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
415                               struct msghdr *, struct iovec *, size_t *);
416 #ifdef ISC_PLATFORM_USETHREADS
417 static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
418 #endif
419
420 #define SELECT_POKE_SHUTDOWN            (-1)
421 #define SELECT_POKE_NOTHING             (-2)
422 #define SELECT_POKE_READ                (-3)
423 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
424 #define SELECT_POKE_WRITE               (-4)
425 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
426 #define SELECT_POKE_CLOSE               (-5)
427
428 #define SOCK_DEAD(s)                    ((s)->references == 0)
429
430 static void
431 manager_log(isc_socketmgr_t *sockmgr,
432             isc_logcategory_t *category, isc_logmodule_t *module, int level,
433             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
434 static void
435 manager_log(isc_socketmgr_t *sockmgr,
436             isc_logcategory_t *category, isc_logmodule_t *module, int level,
437             const char *fmt, ...)
438 {
439         char msgbuf[2048];
440         va_list ap;
441
442         if (! isc_log_wouldlog(isc_lctx, level))
443                 return;
444
445         va_start(ap, fmt);
446         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
447         va_end(ap);
448
449         isc_log_write(isc_lctx, category, module, level,
450                       "sockmgr %p: %s", sockmgr, msgbuf);
451 }
452
453 static void
454 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
455            isc_logcategory_t *category, isc_logmodule_t *module, int level,
456            isc_msgcat_t *msgcat, int msgset, int message,
457            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
458 static void
459 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
460            isc_logcategory_t *category, isc_logmodule_t *module, int level,
461            isc_msgcat_t *msgcat, int msgset, int message,
462            const char *fmt, ...)
463 {
464         char msgbuf[2048];
465         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
466         va_list ap;
467
468         if (! isc_log_wouldlog(isc_lctx, level))
469                 return;
470
471         va_start(ap, fmt);
472         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
473         va_end(ap);
474
475         if (address == NULL) {
476                 isc_log_iwrite(isc_lctx, category, module, level,
477                                msgcat, msgset, message,
478                                "socket %p: %s", sock, msgbuf);
479         } else {
480                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
481                 isc_log_iwrite(isc_lctx, category, module, level,
482                                msgcat, msgset, message,
483                                "socket %p %s: %s", sock, peerbuf, msgbuf);
484         }
485 }
486
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Turn IPV6_RECVPKTINFO back on for 'sock'.  Only IPv6 UDP sockets are
 * touched; any other socket is returned untouched.  A setsockopt()
 * failure is reported through UNEXPECTED_ERROR and otherwise ignored.
 * On every other platform this name expands to a no-op.
 */
static void
FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
{
        char strbuf[ISC_STRERRORSIZE];
        int on = 1;

        if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
                return;

        if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                       (void *)&on, sizeof(on)) < 0) {
                /*
                 * Render errno into strbuf before it is printed; without
                 * this the message would contain uninitialized bytes.
                 */
                isc__strerror(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "setsockopt(%d, IPV6_RECVPKTINFO) "
                                 "%s: %s", sock->fd,
                                 isc_msgcat_get(isc_msgcat,
                                                ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED,
                                                "failed"),
                                 strbuf);
        }
}
#else
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
518
519 static inline isc_result_t
520 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
521         isc_result_t result = ISC_R_SUCCESS;
522
523 #ifdef USE_KQUEUE
524         struct kevent evchange;
525
526         memset(&evchange, 0, sizeof(evchange));
527         if (msg == SELECT_POKE_READ)
528                 evchange.filter = EVFILT_READ;
529         else
530                 evchange.filter = EVFILT_WRITE;
531         evchange.flags = EV_ADD;
532         evchange.ident = fd;
533         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
534                 result = isc__errno2result(errno);
535
536         return (result);
537 #elif defined(USE_EPOLL)
538         struct epoll_event event;
539
540         if (msg == SELECT_POKE_READ)
541                 event.events = EPOLLIN;
542         else
543                 event.events = EPOLLOUT;
544         event.data.fd = fd;
545         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
546             errno != EEXIST) {
547                 result = isc__errno2result(errno);
548         }
549
550         return (result);
551 #elif defined(USE_DEVPOLL)
552         struct pollfd pfd;
553         int lockid = FDLOCK_ID(fd);
554
555         memset(&pfd, 0, sizeof(pfd));
556         if (msg == SELECT_POKE_READ)
557                 pfd.events = POLLIN;
558         else
559                 pfd.events = POLLOUT;
560         pfd.fd = fd;
561         pfd.revents = 0;
562         LOCK(&manager->fdlock[lockid]);
563         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
564                 result = isc__errno2result(errno);
565         else {
566                 if (msg == SELECT_POKE_READ)
567                         manager->fdpollinfo[fd].want_read = 1;
568                 else
569                         manager->fdpollinfo[fd].want_write = 1;
570         }
571         UNLOCK(&manager->fdlock[lockid]);
572
573         return (result);
574 #elif defined(USE_SELECT)
575         LOCK(&manager->lock);
576         if (msg == SELECT_POKE_READ)
577                 FD_SET(fd, manager->read_fds);
578         if (msg == SELECT_POKE_WRITE)
579                 FD_SET(fd, manager->write_fds);
580         UNLOCK(&manager->lock);
581
582         return (result);
583 #endif
584 }
585
586 static inline isc_result_t
587 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
588         isc_result_t result = ISC_R_SUCCESS;
589
590 #ifdef USE_KQUEUE
591         struct kevent evchange;
592
593         memset(&evchange, 0, sizeof(evchange));
594         if (msg == SELECT_POKE_READ)
595                 evchange.filter = EVFILT_READ;
596         else
597                 evchange.filter = EVFILT_WRITE;
598         evchange.flags = EV_DELETE;
599         evchange.ident = fd;
600         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
601                 result = isc__errno2result(errno);
602
603         return (result);
604 #elif defined(USE_EPOLL)
605         struct epoll_event event;
606
607         if (msg == SELECT_POKE_READ)
608                 event.events = EPOLLIN;
609         else
610                 event.events = EPOLLOUT;
611         event.data.fd = fd;
612         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
613             errno != ENOENT) {
614                 char strbuf[ISC_STRERRORSIZE];
615                 isc__strerror(errno, strbuf, sizeof(strbuf));
616                 UNEXPECTED_ERROR(__FILE__, __LINE__,
617                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
618                 result = ISC_R_UNEXPECTED;
619         }
620         return (result);
621 #elif defined(USE_DEVPOLL)
622         struct pollfd pfds[2];
623         size_t writelen = sizeof(pfds[0]);
624         int lockid = FDLOCK_ID(fd);
625
626         memset(pfds, 0, sizeof(pfds));
627         pfds[0].events = POLLREMOVE;
628         pfds[0].fd = fd;
629
630         /*
631          * Canceling read or write polling via /dev/poll is tricky.  Since it
632          * only provides a way of canceling per FD, we may need to re-poll the
633          * socket for the other operation.
634          */
635         LOCK(&manager->fdlock[lockid]);
636         if (msg == SELECT_POKE_READ &&
637             manager->fdpollinfo[fd].want_write == 1) {
638                 pfds[1].events = POLLOUT;
639                 pfds[1].fd = fd;
640                 writelen += sizeof(pfds[1]);
641         }
642         if (msg == SELECT_POKE_WRITE &&
643             manager->fdpollinfo[fd].want_read == 1) {
644                 pfds[1].events = POLLIN;
645                 pfds[1].fd = fd;
646                 writelen += sizeof(pfds[1]);
647         }
648
649         if (write(manager->devpoll_fd, pfds, writelen) == -1)
650                 result = isc__errno2result(errno);
651         else {
652                 if (msg == SELECT_POKE_READ)
653                         manager->fdpollinfo[fd].want_read = 0;
654                 else
655                         manager->fdpollinfo[fd].want_write = 0;
656         }
657         UNLOCK(&manager->fdlock[lockid]);
658
659         return (result);
660 #elif defined(USE_SELECT)
661         LOCK(&manager->lock);
662         if (msg == SELECT_POKE_READ)
663                 FD_CLR(fd, manager->read_fds);
664         else if (msg == SELECT_POKE_WRITE)
665                 FD_CLR(fd, manager->write_fds);
666         UNLOCK(&manager->lock);
667
668         return (result);
669 #endif
670 }
671
672 static void
673 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
674         isc_result_t result;
675         int lockid = FDLOCK_ID(fd);
676
677         /*
678          * This is a wakeup on a socket.  If the socket is not in the
679          * process of being closed, start watching it for either reads
680          * or writes.
681          */
682
683         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
684
685         if (msg == SELECT_POKE_CLOSE) {
686                 /* No one should be updating fdstate, so no need to lock it */
687                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
688                 manager->fdstate[fd] = CLOSED;
689                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
690                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
691                 (void)close(fd);
692                 return;
693         }
694
695         LOCK(&manager->fdlock[lockid]);
696         if (manager->fdstate[fd] == CLOSE_PENDING) {
697                 UNLOCK(&manager->fdlock[lockid]);
698                 /*
699                  * We accept (and ignore) any error from unwatch_fd() as we are
700                  * closing the socket, hoping it doesn't leave dangling state in
701                  * the kernel.
702                  * Note that unwatch_fd() must be called after releasing the
703                  * fdlock; otherwise it could cause deadlock due to a lock order
704                  * reversal.
705                  */
706                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
707                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
708                 return;
709         }
710         if (manager->fdstate[fd] != MANAGED) {
711                 UNLOCK(&manager->fdlock[lockid]);
712                 return;
713         }
714         UNLOCK(&manager->fdlock[lockid]);
715
716         /*
717          * Set requested bit.
718          */
719         result = watch_fd(manager, fd, msg);
720         if (result != ISC_R_SUCCESS) {
721                 /*
722                  * XXXJT: what should we do?  Ignoring the failure of watching
723                  * a socket will make the application dysfunctional, but there
724                  * seems to be no reasonable recovery process.
725                  */
726                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
727                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
728                               "failed to start watching FD (%d): %s",
729                               fd, isc_result_totext(result));
730         }
731 }
732
733 #ifdef ISC_PLATFORM_USETHREADS
734 /*
735  * Poke the select loop when there is something for us to do.
736  * The write is required (by POSIX) to complete.  That is, we
737  * will not get partial writes.
738  */
739 static void
740 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
741         int cc;
742         int buf[2];
743         char strbuf[ISC_STRERRORSIZE];
744
745         buf[0] = fd;
746         buf[1] = msg;
747
748         do {
749                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
750 #ifdef ENOSR
751                 /*
752                  * Treat ENOSR as EAGAIN but loop slowly as it is
753                  * unlikely to clear fast.
754                  */
755                 if (cc < 0 && errno == ENOSR) {
756                         sleep(1);
757                         errno = EAGAIN;
758                 }
759 #endif
760         } while (cc < 0 && SOFT_ERROR(errno));
761
762         if (cc < 0) {
763                 isc__strerror(errno, strbuf, sizeof(strbuf));
764                 FATAL_ERROR(__FILE__, __LINE__,
765                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
766                                            ISC_MSG_WRITEFAILED,
767                                            "write() failed "
768                                            "during watcher poke: %s"),
769                             strbuf);
770         }
771
772         INSIST(cc == sizeof(buf));
773 }
774
775 /*
776  * Read a message on the internal fd.
777  */
778 static void
779 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
780         int buf[2];
781         int cc;
782         char strbuf[ISC_STRERRORSIZE];
783
784         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
785         if (cc < 0) {
786                 *msg = SELECT_POKE_NOTHING;
787                 *fd = -1;       /* Silence compiler. */
788                 if (SOFT_ERROR(errno))
789                         return;
790
791                 isc__strerror(errno, strbuf, sizeof(strbuf));
792                 FATAL_ERROR(__FILE__, __LINE__,
793                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
794                                            ISC_MSG_READFAILED,
795                                            "read() failed "
796                                            "during watcher poke: %s"),
797                             strbuf);
798
799                 return;
800         }
801         INSIST(cc == sizeof(buf));
802
803         *fd = buf[0];
804         *msg = buf[1];
805 }
806 #else /* ISC_PLATFORM_USETHREADS */
807 /*
808  * Update the state of the socketmgr when something changes.
809  */
810 static void
811 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
812         if (msg == SELECT_POKE_SHUTDOWN)
813                 return;
814         else if (fd >= 0)
815                 wakeup_socket(manager, fd, msg);
816         return;
817 }
818 #endif /* ISC_PLATFORM_USETHREADS */
819
820 /*
821  * Make a fd non-blocking.
822  */
823 static isc_result_t
824 make_nonblock(int fd) {
825         int ret;
826         int flags;
827         char strbuf[ISC_STRERRORSIZE];
828 #ifdef USE_FIONBIO_IOCTL
829         int on = 1;
830
831         ret = ioctl(fd, FIONBIO, (char *)&on);
832 #else
833         flags = fcntl(fd, F_GETFL, 0);
834         flags |= PORT_NONBLOCK;
835         ret = fcntl(fd, F_SETFL, flags);
836 #endif
837
838         if (ret == -1) {
839                 isc__strerror(errno, strbuf, sizeof(strbuf));
840                 UNEXPECTED_ERROR(__FILE__, __LINE__,
841 #ifdef USE_FIONBIO_IOCTL
842                                  "ioctl(%d, FIONBIO, &on): %s", fd,
843 #else
844                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
845 #endif
846                                  strbuf);
847
848                 return (ISC_R_UNEXPECTED);
849         }
850
851         return (ISC_R_SUCCESS);
852 }
853
854 #ifdef USE_CMSG
855 /*
856  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
857  * In order to ensure as much portability as possible, we provide wrapper
858  * functions of these macros.
859  * Note that cmsg_space() could run slow on OSes that do not have
860  * CMSG_SPACE.
861  */
862 static inline ISC_SOCKADDR_LEN_T
863 cmsg_len(ISC_SOCKADDR_LEN_T len) {
864 #ifdef CMSG_LEN
865         return (CMSG_LEN(len));
866 #else
867         ISC_SOCKADDR_LEN_T hdrlen;
868
869         /*
870          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
871          * is correct.
872          */
873         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
874         return (hdrlen + len);
875 #endif
876 }
877
878 static inline ISC_SOCKADDR_LEN_T
879 cmsg_space(ISC_SOCKADDR_LEN_T len) {
880 #ifdef CMSG_SPACE
881         return (CMSG_SPACE(len));
882 #else
883         struct msghdr msg;
884         struct cmsghdr *cmsgp;
885         /*
886          * XXX: The buffer length is an ad-hoc value, but should be enough
887          * in a practical sense.
888          */
889         char dummybuf[sizeof(struct cmsghdr) + 1024];
890
891         memset(&msg, 0, sizeof(msg));
892         msg.msg_control = dummybuf;
893         msg.msg_controllen = sizeof(dummybuf);
894
895         cmsgp = (struct cmsghdr *)dummybuf;
896         cmsgp->cmsg_len = cmsg_len(len);
897
898         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
899         if (cmsgp != NULL)
900                 return ((char *)cmsgp - (char *)msg.msg_control);
901         else
902                 return (0);
903 #endif
904 }
905 #endif /* USE_CMSG */
906
/*
 * Process control messages received on a socket.
 *
 * Translates information in 'msg' into attribute bits and fields of 'dev':
 *
 *      - MSG_TRUNC / MSG_CTRUNC in msg_flags set ISC_SOCKEVENTATTR_TRUNC /
 *        ISC_SOCKEVENTATTR_CTRUNC.
 *      - An IPPROTO_IPV6/IPV6_PKTINFO message is copied into dev->pktinfo
 *        and sets ISC_SOCKEVENTATTR_PKTINFO (plus
 *        ISC_SOCKEVENTATTR_MULTICAST when its address is multicast).
 *      - A SOL_SOCKET/SCM_TIMESTAMP message fills dev->timestamp and sets
 *        ISC_SOCKEVENTATTR_TIMESTAMP.
 *
 * Each part is compiled only when the corresponding feature macro is
 * defined; without ISC_NET_BSD44MSGHDR the function does nothing.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
        struct cmsghdr *cmsgp;
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
        struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
        struct timeval *timevalp;
#endif
#endif

        /*
         * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
         * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
         * They are all here, outside of the CPP tests, because it is
         * more consistent with the usual ISC coding style.
         */
        UNUSED(sock);
        UNUSED(msg);
        UNUSED(dev);

#ifdef ISC_NET_BSD44MSGHDR

        /* Data did not fit in the supplied iovecs. */
#ifdef MSG_TRUNC
        if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
                dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
#endif

        /* Control data did not fit in the supplied control buffer. */
#ifdef MSG_CTRUNC
        if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
                dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
#endif

#ifndef USE_CMSG
        return;
#else
        /* Nothing to walk if no control data was returned. */
        if (msg->msg_controllen == 0U || msg->msg_control == NULL)
                return;

#ifdef SO_TIMESTAMP
        timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
        pktinfop = NULL;
#endif

        /* Visit every control message header attached to 'msg'. */
        cmsgp = CMSG_FIRSTHDR(msg);
        while (cmsgp != NULL) {
                socket_log(sock, NULL, TRACE,
                           isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
                           "processing cmsg %p", cmsgp);

#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
                if (cmsgp->cmsg_level == IPPROTO_IPV6
                    && cmsgp->cmsg_type == IPV6_PKTINFO) {

                        /* Keep a copy of the in6_pktinfo payload in dev. */
                        pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
                        memcpy(&dev->pktinfo, pktinfop,
                               sizeof(struct in6_pktinfo));
                        dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
                        socket_log(sock, NULL, TRACE,
                                   isc_msgcat, ISC_MSGSET_SOCKET,
                                   ISC_MSG_IFRECEIVED,
                                   "interface received on ifindex %u",
                                   dev->pktinfo.ipi6_ifindex);
                        if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
                                dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
                        goto next;
                }
#endif

#ifdef SO_TIMESTAMP
                if (cmsgp->cmsg_level == SOL_SOCKET
                    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
                        /* Convert the timeval (usec) to seconds + nanoseconds. */
                        timevalp = (struct timeval *)CMSG_DATA(cmsgp);
                        dev->timestamp.seconds = timevalp->tv_sec;
                        dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
                        dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
                        goto next;
                }
#endif

                /* Unrecognized message types fall through and are skipped. */
        next:
                cmsgp = CMSG_NXTHDR(msg, cmsgp);
        }
#endif /* USE_CMSG */

#endif /* ISC_NET_BSD44MSGHDR */
}
1000
/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of each buffer
 * (if dev->bufferlist is non-empty) or the internal region dev->region
 * (if the buffer list is empty, i.e. single buffer I/O is requested).
 *
 * In both cases the first dev->n bytes are skipped, since they have
 * already been written.
 *
 * The destination address is taken from dev->address unless the socket is
 * marked connected, in which case no name is supplied.
 *
 * 'sock', 'dev', 'msg' and 'iov' must not be NULL.  'iov' must have room
 * for MAXSCATTERGATHER_SEND entries.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
                  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
        unsigned int iovcount;
        isc_buffer_t *buffer;
        isc_region_t used;
        size_t write_count;
        size_t skip_count;

        memset(msg, 0, sizeof(*msg));

        if (!sock->connected) {
                msg->msg_name = (void *)&dev->address.type.sa;
                msg->msg_namelen = dev->address.length;
        } else {
                msg->msg_name = NULL;
                msg->msg_namelen = 0;
        }

        buffer = ISC_LIST_HEAD(dev->bufferlist);
        write_count = 0;
        iovcount = 0;

        /*
         * Single buffer I/O?  Skip what we've done so far in this region.
         */
        if (buffer == NULL) {
                write_count = dev->region.length - dev->n;
                iov[0].iov_base = (void *)(dev->region.base + dev->n);
                iov[0].iov_len = write_count;
                iovcount = 1;

                goto config;
        }

        /*
         * Multibuffer I/O.
         * Skip the data in the buffer list that we have already written.
         * On exit 'buffer' is the first buffer with unsent data (or NULL)
         * and 'skip_count' is the number of bytes already sent from it.
         */
        skip_count = dev->n;
        while (buffer != NULL) {
                REQUIRE(ISC_BUFFER_VALID(buffer));
                if (skip_count < isc_buffer_usedlength(buffer))
                        break;
                skip_count -= isc_buffer_usedlength(buffer);
                buffer = ISC_LIST_NEXT(buffer, link);
        }

        /*
         * One iovec per remaining non-empty buffer.  Only the first of
         * them is offset by skip_count; it is zeroed after that.
         */
        while (buffer != NULL) {
                INSIST(iovcount < MAXSCATTERGATHER_SEND);

                isc_buffer_usedregion(buffer, &used);

                if (used.length > 0) {
                        iov[iovcount].iov_base = (void *)(used.base
                                                          + skip_count);
                        iov[iovcount].iov_len = used.length - skip_count;
                        write_count += (used.length - skip_count);
                        skip_count = 0;
                        iovcount++;
                }
                buffer = ISC_LIST_NEXT(buffer, link);
        }

        INSIST(skip_count == 0U);

 config:
        msg->msg_iov = iov;
        msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
        msg->msg_control = NULL;
        msg->msg_controllen = 0;
        msg->msg_flags = 0;
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
        /*
         * For UDP sends flagged with ISC_SOCKEVENTATTR_PKTINFO, attach an
         * IPV6_PKTINFO control message built from dev->pktinfo in the
         * socket's send control buffer.
         */
        if ((sock->type == isc_sockettype_udp)
            && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
                struct cmsghdr *cmsgp;
                struct in6_pktinfo *pktinfop;

                socket_log(sock, NULL, TRACE,
                           isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
                           "sendto pktinfo data, ifindex %u",
                           dev->pktinfo.ipi6_ifindex);

                msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
                INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
                msg->msg_control = (void *)sock->sendcmsgbuf;

                cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
                cmsgp->cmsg_level = IPPROTO_IPV6;
                cmsgp->cmsg_type = IPV6_PKTINFO;
                cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
                pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
                memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
        }
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIN6PKTINFO */
#else /* ISC_NET_BSD44MSGHDR */
        msg->msg_accrights = NULL;
        msg->msg_accrightslen = 0;
#endif /* ISC_NET_BSD44MSGHDR */

        if (write_countp != NULL)
                *write_countp = write_count;
}
1119
1120 /*
1121  * Construct an iov array and attach it to the msghdr passed in.  This is
1122  * the RECV constructor, which will use the avialable region of the buffer
1123  * (if using a buffer list) or will use the internal region (if a single
1124  * buffer I/O is requested).
1125  *
1126  * Nothing can be NULL, and the done event must list at least one buffer
1127  * on the buffer linked list for this function to be meaningful.
1128  *
1129  * If read_countp != NULL, *read_countp will hold the number of bytes
1130  * this transaction can receive.
1131  */
1132 static void
1133 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1134                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1135 {
1136         unsigned int iovcount;
1137         isc_buffer_t *buffer;
1138         isc_region_t available;
1139         size_t read_count;
1140
1141         memset(msg, 0, sizeof(struct msghdr));
1142
1143         if (sock->type == isc_sockettype_udp) {
1144                 memset(&dev->address, 0, sizeof(dev->address));
1145 #ifdef BROKEN_RECVMSG
1146                 if (sock->pf == AF_INET) {
1147                         msg->msg_name = (void *)&dev->address.type.sin;
1148                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1149                 } else if (sock->pf == AF_INET6) {
1150                         msg->msg_name = (void *)&dev->address.type.sin6;
1151                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1152 #ifdef ISC_PLATFORM_HAVESYSUNH
1153                 } else if (sock->pf == AF_UNIX) {
1154                         msg->msg_name = (void *)&dev->address.type.sunix;
1155                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1156 #endif
1157                 } else {
1158                         msg->msg_name = (void *)&dev->address.type.sa;
1159                         msg->msg_namelen = sizeof(dev->address.type);
1160                 }
1161 #else
1162                 msg->msg_name = (void *)&dev->address.type.sa;
1163                 msg->msg_namelen = sizeof(dev->address.type);
1164 #endif
1165 #ifdef ISC_NET_RECVOVERFLOW
1166                 /* If needed, steal one iovec for overflow detection. */
1167                 maxiov--;
1168 #endif
1169         } else { /* TCP */
1170                 msg->msg_name = NULL;
1171                 msg->msg_namelen = 0;
1172                 dev->address = sock->address;
1173         }
1174
1175         buffer = ISC_LIST_HEAD(dev->bufferlist);
1176         read_count = 0;
1177
1178         /*
1179          * Single buffer I/O?  Skip what we've done so far in this region.
1180          */
1181         if (buffer == NULL) {
1182                 read_count = dev->region.length - dev->n;
1183                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1184                 iov[0].iov_len = read_count;
1185                 iovcount = 1;
1186
1187                 goto config;
1188         }
1189
1190         /*
1191          * Multibuffer I/O.
1192          * Skip empty buffers.
1193          */
1194         while (buffer != NULL) {
1195                 REQUIRE(ISC_BUFFER_VALID(buffer));
1196                 if (isc_buffer_availablelength(buffer) != 0)
1197                         break;
1198                 buffer = ISC_LIST_NEXT(buffer, link);
1199         }
1200
1201         iovcount = 0;
1202         while (buffer != NULL) {
1203                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1204
1205                 isc_buffer_availableregion(buffer, &available);
1206
1207                 if (available.length > 0) {
1208                         iov[iovcount].iov_base = (void *)(available.base);
1209                         iov[iovcount].iov_len = available.length;
1210                         read_count += available.length;
1211                         iovcount++;
1212                 }
1213                 buffer = ISC_LIST_NEXT(buffer, link);
1214         }
1215
1216  config:
1217
1218         /*
1219          * If needed, set up to receive that one extra byte.  Note that
1220          * we know there is at least one iov left, since we stole it
1221          * at the top of this function.
1222          */
1223 #ifdef ISC_NET_RECVOVERFLOW
1224         if (sock->type == isc_sockettype_udp) {
1225                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1226                 iov[iovcount].iov_len = 1;
1227                 iovcount++;
1228         }
1229 #endif
1230
1231         msg->msg_iov = iov;
1232         msg->msg_iovlen = iovcount;
1233
1234 #ifdef ISC_NET_BSD44MSGHDR
1235         msg->msg_control = NULL;
1236         msg->msg_controllen = 0;
1237         msg->msg_flags = 0;
1238 #if defined(USE_CMSG)
1239         if (sock->type == isc_sockettype_udp) {
1240                 msg->msg_control = sock->recvcmsgbuf;
1241                 msg->msg_controllen = sock->recvcmsgbuflen;
1242         }
1243 #endif /* USE_CMSG */
1244 #else /* ISC_NET_BSD44MSGHDR */
1245         msg->msg_accrights = NULL;
1246         msg->msg_accrightslen = 0;
1247 #endif /* ISC_NET_BSD44MSGHDR */
1248
1249         if (read_countp != NULL)
1250                 *read_countp = read_count;
1251 }
1252
1253 static void
1254 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1255                 isc_socketevent_t *dev)
1256 {
1257         if (sock->type == isc_sockettype_udp) {
1258                 if (address != NULL)
1259                         dev->address = *address;
1260                 else
1261                         dev->address = sock->address;
1262         } else if (sock->type == isc_sockettype_tcp) {
1263                 INSIST(address == NULL);
1264                 dev->address = sock->address;
1265         }
1266 }
1267
1268 static void
1269 destroy_socketevent(isc_event_t *event) {
1270         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1271
1272         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1273
1274         (ev->destroy)(event);
1275 }
1276
1277 static isc_socketevent_t *
1278 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1279                      isc_taskaction_t action, const void *arg)
1280 {
1281         isc_socketevent_t *ev;
1282
1283         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1284                                                      sock, eventtype,
1285                                                      action, arg,
1286                                                      sizeof(*ev));
1287
1288         if (ev == NULL)
1289                 return (NULL);
1290
1291         ev->result = ISC_R_UNEXPECTED;
1292         ISC_LINK_INIT(ev, ev_link);
1293         ISC_LIST_INIT(ev->bufferlist);
1294         ev->region.base = NULL;
1295         ev->n = 0;
1296         ev->offset = 0;
1297         ev->attributes = 0;
1298         ev->destroy = ev->ev_destroy;
1299         ev->ev_destroy = destroy_socketevent;
1300
1301         return (ev);
1302 }
1303
1304 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' to stdout -- the name buffer,
 * every iovec entry and, when ISC_NET_BSD44MSGHDR is defined, the control
 * buffer.  'msg' itself is not modified.
 *
 * Pointers are cast to void * for %p and the unsigned index is printed
 * with %u, as required by the printf conversion rules.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
1323 #endif
1324
/*
 * Status codes returned by doio_recv() and doio_send().
 */
#define DOIO_SUCCESS            0       /* i/o ok, event sent */
#define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD               2       /* i/o error, event sent */
#define DOIO_EOF                3       /* EOF, no event sent */
1329
1330 static int
1331 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1332         int cc;
1333         struct iovec iov[MAXSCATTERGATHER_RECV];
1334         size_t read_count;
1335         size_t actual_count;
1336         struct msghdr msghdr;
1337         isc_buffer_t *buffer;
1338         int recv_errno;
1339         char strbuf[ISC_STRERRORSIZE];
1340
1341         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1342
1343 #if defined(ISC_SOCKET_DEBUG)
1344         dump_msg(&msghdr);
1345 #endif
1346
1347         cc = recvmsg(sock->fd, &msghdr, 0);
1348         recv_errno = errno;
1349
1350 #if defined(ISC_SOCKET_DEBUG)
1351         dump_msg(&msghdr);
1352 #endif
1353
1354         if (cc < 0) {
1355                 if (SOFT_ERROR(recv_errno))
1356                         return (DOIO_SOFT);
1357
1358                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1359                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1360                         socket_log(sock, NULL, IOEVENT,
1361                                    isc_msgcat, ISC_MSGSET_SOCKET,
1362                                    ISC_MSG_DOIORECV,
1363                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1364                                    sock->fd, cc, recv_errno, strbuf);
1365                 }
1366
1367 #define SOFT_OR_HARD(_system, _isc) \
1368         if (recv_errno == _system) { \
1369                 if (sock->connected) { \
1370                         dev->result = _isc; \
1371                         return (DOIO_HARD); \
1372                 } \
1373                 return (DOIO_SOFT); \
1374         }
1375 #define ALWAYS_HARD(_system, _isc) \
1376         if (recv_errno == _system) { \
1377                 dev->result = _isc; \
1378                 return (DOIO_HARD); \
1379         }
1380
1381                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1382                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1383                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1384                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1385                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1386                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1387                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1388                 /*
1389                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1390                  * errors.
1391                  */
1392 #ifdef EPROTO
1393                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1394 #endif
1395                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1396
1397 #undef SOFT_OR_HARD
1398 #undef ALWAYS_HARD
1399
1400                 dev->result = isc__errno2result(recv_errno);
1401                 return (DOIO_HARD);
1402         }
1403
1404         /*
1405          * On TCP, zero length reads indicate EOF, while on
1406          * UDP, zero length reads are perfectly valid, although
1407          * strange.
1408          */
1409         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1410                 return (DOIO_EOF);
1411
1412         if (sock->type == isc_sockettype_udp) {
1413                 dev->address.length = msghdr.msg_namelen;
1414                 if (isc_sockaddr_getport(&dev->address) == 0) {
1415                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416                                 socket_log(sock, &dev->address, IOEVENT,
1417                                            isc_msgcat, ISC_MSGSET_SOCKET,
1418                                            ISC_MSG_ZEROPORT,
1419                                            "dropping source port zero packet");
1420                         }
1421                         return (DOIO_SOFT);
1422                 }
1423         }
1424
1425         socket_log(sock, &dev->address, IOEVENT,
1426                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1427                    "packet received correctly");
1428
1429         /*
1430          * Overflow bit detection.  If we received MORE bytes than we should,
1431          * this indicates an overflow situation.  Set the flag in the
1432          * dev entry and adjust how much we read by one.
1433          */
1434 #ifdef ISC_NET_RECVOVERFLOW
1435         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1436                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1437                 cc--;
1438         }
1439 #endif
1440
1441         /*
1442          * If there are control messages attached, run through them and pull
1443          * out the interesting bits.
1444          */
1445         if (sock->type == isc_sockettype_udp)
1446                 process_cmsg(sock, &msghdr, dev);
1447
1448         /*
1449          * update the buffers (if any) and the i/o count
1450          */
1451         dev->n += cc;
1452         actual_count = cc;
1453         buffer = ISC_LIST_HEAD(dev->bufferlist);
1454         while (buffer != NULL && actual_count > 0U) {
1455                 REQUIRE(ISC_BUFFER_VALID(buffer));
1456                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1457                         actual_count -= isc_buffer_availablelength(buffer);
1458                         isc_buffer_add(buffer,
1459                                        isc_buffer_availablelength(buffer));
1460                 } else {
1461                         isc_buffer_add(buffer, actual_count);
1462                         actual_count = 0;
1463                         break;
1464                 }
1465                 buffer = ISC_LIST_NEXT(buffer, link);
1466                 if (buffer == NULL) {
1467                         INSIST(actual_count == 0U);
1468                 }
1469         }
1470
1471         /*
1472          * If we read less than we expected, update counters,
1473          * and let the upper layer poke the descriptor.
1474          */
1475         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1476                 return (DOIO_SOFT);
1477
1478         /*
1479          * Full reads are posted, or partials if partials are ok.
1480          */
1481         dev->result = ISC_R_SUCCESS;
1482         return (DOIO_SUCCESS);
1483 }
1484
/*
 * Perform one sendmsg() for the send request 'dev' on 'sock' and update
 * 'dev' with the outcome.  A send interrupted by EINTR is retried while
 * the attempt count stays below NRETRIES.
 *
 * Returns:
 *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
 *                      ISC_R_SUCCESS.
 *
 *      DOIO_HARD       A hard or unexpected I/O error was encountered.
 *                      dev->result contains the appropriate error.
 *
 *      DOIO_SOFT       A soft I/O error was encountered, or only part of
 *                      the data was written.  No senddone event was sent.
 *                      The operation should be retried.
 *
 *      No other return values are possible.
 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
        int cc;
        struct iovec iov[MAXSCATTERGATHER_SEND];
        size_t write_count;
        struct msghdr msghdr;
        char addrbuf[ISC_SOCKADDR_FORMATSIZE];
        int attempts = 0;
        int send_errno;
        char strbuf[ISC_STRERRORSIZE];

        build_msghdr_send(sock, dev, &msghdr, iov, &write_count);

 resend:
        cc = sendmsg(sock->fd, &msghdr, 0);
        /* Save errno before any later call can overwrite it. */
        send_errno = errno;

        /*
         * Check for error or block condition.
         */
        if (cc < 0) {
                if (send_errno == EINTR && ++attempts < NRETRIES)
                        goto resend;

                if (SOFT_ERROR(send_errno))
                        return (DOIO_SOFT);

/*
 * SOFT_OR_HARD: the errno is a hard error (with result _isc) only on a
 * connected socket; otherwise it is treated as soft.
 * ALWAYS_HARD: the errno is always a hard error with result _isc.
 */
#define SOFT_OR_HARD(_system, _isc) \
        if (send_errno == _system) { \
                if (sock->connected) { \
                        dev->result = _isc; \
                        return (DOIO_HARD); \
                } \
                return (DOIO_SOFT); \
        }
#define ALWAYS_HARD(_system, _isc) \
        if (send_errno == _system) { \
                dev->result = _isc; \
                return (DOIO_HARD); \
        }

                SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
                ALWAYS_HARD(EACCES, ISC_R_NOPERM);
                ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
                ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
                ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
                ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif
                ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
                ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
                ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
                ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
                ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

                /*
                 * The other error types depend on whether or not the
                 * socket is UDP or TCP.  If it is UDP, some errors
                 * that we expect to be fatal under TCP are merely
                 * annoying, and are really soft errors.
                 *
                 * However, these soft errors are still returned as
                 * a status.
                 */
                isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
                isc__strerror(send_errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
                                 addrbuf, strbuf);
                dev->result = isc__errno2result(send_errno);
                return (DOIO_HARD);
        }

        /* A zero-byte send is reported but otherwise handled as usual. */
        if (cc == 0)
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "internal_send: send() %s 0",
                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                                ISC_MSG_RETURNED, "returned"));

        /*
         * If we write less than we expected, update counters, poke.
         */
        dev->n += cc;
        if ((size_t)cc != write_count)
                return (DOIO_SOFT);

        /*
         * Exactly what we wanted to write.  We're done with this
         * entry.  Post its completion event.
         */
        dev->result = ISC_R_SUCCESS;
        return (DOIO_SUCCESS);
}
1593
1594 /*
1595  * Kill.
1596  *
1597  * Caller must ensure that the socket is not locked and no external
1598  * references exist.
1599  */
1600 static void
1601 closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) {
1602         int lockid = FDLOCK_ID(fd);
1603
1604         UNUSED(type);
1605
1606         /*
1607          * No one has this socket open, so the watcher doesn't have to be
1608          * poked, and the socket doesn't have to be locked.
1609          */
1610         LOCK(&manager->fdlock[lockid]);
1611         manager->fds[fd] = NULL;
1612         manager->fdstate[fd] = CLOSE_PENDING;
1613         UNLOCK(&manager->fdlock[lockid]);
1614         select_poke(manager, fd, SELECT_POKE_CLOSE);
1615
1616         /*
1617          * update manager->maxfd here (XXX: this should be implemented more
1618          * efficiently)
1619          */
1620 #ifdef USE_SELECT
1621         LOCK(&manager->lock);
1622         if (manager->maxfd == fd) {
1623                 int i;
1624
1625                 manager->maxfd = 0;
1626                 for (i = fd - 1; i >= 0; i--) {
1627                         lockid = FDLOCK_ID(i);
1628
1629                         LOCK(&manager->fdlock[lockid]);
1630                         if (manager->fdstate[i] == MANAGED) {
1631                                 manager->maxfd = i;
1632                                 UNLOCK(&manager->fdlock[lockid]);
1633                                 break;
1634                         }
1635                         UNLOCK(&manager->fdlock[lockid]);
1636                 }
1637 #ifdef ISC_PLATFORM_USETHREADS
1638                 if (manager->maxfd < manager->pipe_fds[0])
1639                         manager->maxfd = manager->pipe_fds[0];
1640 #endif
1641         }
1642         UNLOCK(&manager->lock);
1643 #endif  /* USE_SELECT */
1644 }
1645
1646 static void
1647 destroy(isc_socket_t **sockp) {
1648         int fd;
1649         isc_socket_t *sock = *sockp;
1650         isc_socketmgr_t *manager = sock->manager;
1651
1652         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1653                    ISC_MSG_DESTROYING, "destroying");
1654
1655         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1656         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1657         INSIST(ISC_LIST_EMPTY(sock->send_list));
1658         INSIST(sock->connect_ev == NULL);
1659         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1660
1661         if (sock->fd >= 0) {
1662                 fd = sock->fd;
1663                 sock->fd = -1;
1664                 closesocket(manager, sock->type, fd);
1665         }
1666
1667         LOCK(&manager->lock);
1668
1669         ISC_LIST_UNLINK(manager->socklist, sock, link);
1670
1671 #ifdef ISC_PLATFORM_USETHREADS
1672         if (ISC_LIST_EMPTY(manager->socklist))
1673                 SIGNAL(&manager->shutdown_ok);
1674 #endif /* ISC_PLATFORM_USETHREADS */
1675
1676         UNLOCK(&manager->lock);
1677
1678         free_socket(sockp);
1679 }
1680
1681 static isc_result_t
1682 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1683                 isc_socket_t **socketp)
1684 {
1685         isc_socket_t *sock;
1686         isc_result_t result;
1687         ISC_SOCKADDR_LEN_T cmsgbuflen;
1688
1689         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1690
1691         if (sock == NULL)
1692                 return (ISC_R_NOMEMORY);
1693
1694         result = ISC_R_UNEXPECTED;
1695
1696         sock->magic = 0;
1697         sock->references = 0;
1698
1699         sock->manager = manager;
1700         sock->type = type;
1701         sock->fd = -1;
1702
1703         ISC_LINK_INIT(sock, link);
1704
1705         sock->recvcmsgbuf = NULL;
1706         sock->sendcmsgbuf = NULL;
1707
1708         /*
1709          * set up cmsg buffers
1710          */
1711         cmsgbuflen = 0;
1712 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1713         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1714 #endif
1715 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1716         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1717 #endif
1718         sock->recvcmsgbuflen = cmsgbuflen;
1719         if (sock->recvcmsgbuflen != 0U) {
1720                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1721                 if (sock->recvcmsgbuf == NULL)
1722                         goto error;
1723         }
1724
1725         cmsgbuflen = 0;
1726 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1727         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1728 #endif
1729         sock->sendcmsgbuflen = cmsgbuflen;
1730         if (sock->sendcmsgbuflen != 0U) {
1731                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1732                 if (sock->sendcmsgbuf == NULL)
1733                         goto error;
1734         }
1735
1736         /*
1737          * set up list of readers and writers to be initially empty
1738          */
1739         ISC_LIST_INIT(sock->recv_list);
1740         ISC_LIST_INIT(sock->send_list);
1741         ISC_LIST_INIT(sock->accept_list);
1742         sock->connect_ev = NULL;
1743         sock->pending_recv = 0;
1744         sock->pending_send = 0;
1745         sock->pending_accept = 0;
1746         sock->listener = 0;
1747         sock->connected = 0;
1748         sock->connecting = 0;
1749         sock->bound = 0;
1750
1751         /*
1752          * initialize the lock
1753          */
1754         result = isc_mutex_init(&sock->lock);
1755         if (result != ISC_R_SUCCESS) {
1756                 sock->magic = 0;
1757                 goto error;
1758         }
1759
1760         /*
1761          * Initialize readable and writable events
1762          */
1763         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1764                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1765                        NULL, sock, sock, NULL, NULL);
1766         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1767                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1768                        NULL, sock, sock, NULL, NULL);
1769
1770         sock->magic = SOCKET_MAGIC;
1771         *socketp = sock;
1772
1773         return (ISC_R_SUCCESS);
1774
1775  error:
1776         if (sock->recvcmsgbuf != NULL)
1777                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1778                             sock->recvcmsgbuflen);
1779         if (sock->sendcmsgbuf != NULL)
1780                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1781                             sock->sendcmsgbuflen);
1782         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1783
1784         return (result);
1785 }
1786
1787 /*
1788  * This event requires that the various lists be empty, that the reference
1789  * count be 1, and that the magic number is valid.  The other socket bits,
1790  * like the lock, must be initialized as well.  The fd associated must be
1791  * marked as closed, by setting it to -1 on close, or this routine will
1792  * also close the socket.
1793  */
1794 static void
1795 free_socket(isc_socket_t **socketp) {
1796         isc_socket_t *sock = *socketp;
1797
1798         INSIST(sock->references == 0);
1799         INSIST(VALID_SOCKET(sock));
1800         INSIST(!sock->connecting);
1801         INSIST(!sock->pending_recv);
1802         INSIST(!sock->pending_send);
1803         INSIST(!sock->pending_accept);
1804         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1805         INSIST(ISC_LIST_EMPTY(sock->send_list));
1806         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1807         INSIST(!ISC_LINK_LINKED(sock, link));
1808
1809         if (sock->recvcmsgbuf != NULL)
1810                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1811                             sock->recvcmsgbuflen);
1812         if (sock->sendcmsgbuf != NULL)
1813                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1814                             sock->sendcmsgbuflen);
1815
1816         sock->magic = 0;
1817
1818         DESTROYLOCK(&sock->lock);
1819
1820         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1821
1822         *socketp = NULL;
1823 }
1824
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary.  We have to work out which kernel
 * version we are running on at run time so that we don't cause the kernel
 * to issue a warning about our use of a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, clear 'bsdcompat' when the kernel release reported by uname()
 * parses as version 2.4 or later.  On other platforms this does nothing.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
        struct utsname uts;
        char *rest;
        long int kmajor;
        long int kminor;

        uname(&uts);    /* Can only fail if the buffer is bad in Linux. */

        /* Paranoia in parsing can be increased, but we trust uname(). */
        kmajor = strtol(uts.release, &rest, 10);
        if (*rest != '.')
                return;

        kminor = strtol(rest + 1, &rest, 10);
        if (kmajor > 2 || (kmajor == 2 && kminor >= 4))
                bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif /* SO_BSDCOMPAT */
1862
1863 static isc_result_t
1864 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
1865         char strbuf[ISC_STRERRORSIZE];
1866         const char *err = "socket";
1867         int tries = 0;
1868 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1869         int on = 1;
1870 #endif
1871 #if defined(SO_RCVBUF)
1872         ISC_SOCKADDR_LEN_T optlen;
1873         int size;
1874 #endif
1875
1876  again:
1877         switch (sock->type) {
1878         case isc_sockettype_udp:
1879                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1880                 break;
1881         case isc_sockettype_tcp:
1882                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1883                 break;
1884         case isc_sockettype_unix:
1885                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
1886                 break;
1887         }
1888         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
1889                 goto again;
1890
1891 #ifdef F_DUPFD
1892         /*
1893          * Leave a space for stdio and TCP to work in.
1894          */
1895         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
1896             sock->fd >= 0 && sock->fd < manager->reserved) {
1897                 int new, tmp;
1898                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
1899                 tmp = errno;
1900                 (void)close(sock->fd);
1901                 errno = tmp;
1902                 sock->fd = new;
1903                 err = "isc_socket_create: fcntl/reserved";
1904         } else if (sock->fd >= 0 && sock->fd < 20) {
1905                 int new, tmp;
1906                 new = fcntl(sock->fd, F_DUPFD, 20);
1907                 tmp = errno;
1908                 (void)close(sock->fd);
1909                 errno = tmp;
1910                 sock->fd = new;
1911                 err = "isc_socket_create: fcntl";
1912         }
1913 #endif
1914
1915         if (sock->fd >= (int)manager->maxsocks) {
1916                 (void)close(sock->fd);
1917                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1918                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1919                                isc_msgcat, ISC_MSGSET_SOCKET,
1920                                ISC_MSG_TOOMANYFDS,
1921                                "socket: file descriptor exceeds limit (%d/%u)",
1922                                sock->fd, manager->maxsocks);
1923                 return (ISC_R_NORESOURCES);
1924         }
1925
1926         if (sock->fd < 0) {
1927                 switch (errno) {
1928                 case EMFILE:
1929                 case ENFILE:
1930                 case ENOBUFS:
1931                         return (ISC_R_NORESOURCES);
1932
1933                 case EPROTONOSUPPORT:
1934                 case EPFNOSUPPORT:
1935                 case EAFNOSUPPORT:
1936                 /*
1937                  * Linux 2.2 (and maybe others) return EINVAL instead of
1938                  * EAFNOSUPPORT.
1939                  */
1940                 case EINVAL:
1941                         return (ISC_R_FAMILYNOSUPPORT);
1942
1943                 default:
1944                         isc__strerror(errno, strbuf, sizeof(strbuf));
1945                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1946                                          "%s() %s: %s", err,
1947                                          isc_msgcat_get(isc_msgcat,
1948                                                         ISC_MSGSET_GENERAL,
1949                                                         ISC_MSG_FAILED,
1950                                                         "failed"),
1951                                          strbuf);
1952                         return (ISC_R_UNEXPECTED);
1953                 }
1954         }
1955
1956         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1957                 (void)close(sock->fd);
1958                 return (ISC_R_UNEXPECTED);
1959         }
1960
1961 #ifdef SO_BSDCOMPAT
1962         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
1963                                   clear_bsdcompat) == ISC_R_SUCCESS);
1964         if (sock->type != isc_sockettype_unix && bsdcompat &&
1965             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1966                        (void *)&on, sizeof(on)) < 0) {
1967                 isc__strerror(errno, strbuf, sizeof(strbuf));
1968                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1969                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1970                                  sock->fd,
1971                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1972                                                 ISC_MSG_FAILED, "failed"),
1973                                  strbuf);
1974                 /* Press on... */
1975         }
1976 #endif
1977
1978 #ifdef SO_NOSIGPIPE
1979         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
1980                        (void *)&on, sizeof(on)) < 0) {
1981                 isc__strerror(errno, strbuf, sizeof(strbuf));
1982                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1983                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
1984                                  sock->fd,
1985                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1986                                                 ISC_MSG_FAILED, "failed"),
1987                                  strbuf);
1988                 /* Press on... */
1989         }
1990 #endif
1991
1992 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1993         if (sock->type == isc_sockettype_udp) {
1994
1995 #if defined(USE_CMSG)
1996 #if defined(SO_TIMESTAMP)
1997                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1998                                (void *)&on, sizeof(on)) < 0
1999                     && errno != ENOPROTOOPT) {
2000                         isc__strerror(errno, strbuf, sizeof(strbuf));
2001                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2002                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2003                                          sock->fd,
2004                                          isc_msgcat_get(isc_msgcat,
2005                                                         ISC_MSGSET_GENERAL,
2006                                                         ISC_MSG_FAILED,
2007                                                         "failed"),
2008                                          strbuf);
2009                         /* Press on... */
2010                 }
2011 #endif /* SO_TIMESTAMP */
2012
2013 #if defined(ISC_PLATFORM_HAVEIPV6)
2014                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2015                         /*
2016                          * Warn explicitly because this anomaly can be hidden
2017                          * in usual operation (and unexpectedly appear later).
2018                          */
2019                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2020                                          "No buffer available to receive "
2021                                          "IPv6 destination");
2022                 }
2023 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2024 #ifdef IPV6_RECVPKTINFO
2025                 /* RFC 3542 */
2026                 if ((sock->pf == AF_INET6)
2027                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2028                                    (void *)&on, sizeof(on)) < 0)) {
2029                         isc__strerror(errno, strbuf, sizeof(strbuf));
2030                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2031                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2032                                          "%s: %s", sock->fd,
2033                                          isc_msgcat_get(isc_msgcat,
2034                                                         ISC_MSGSET_GENERAL,
2035                                                         ISC_MSG_FAILED,
2036                                                         "failed"),
2037                                          strbuf);
2038                 }
2039 #else
2040                 /* RFC 2292 */
2041                 if ((sock->pf == AF_INET6)
2042                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2043                                    (void *)&on, sizeof(on)) < 0)) {
2044                         isc__strerror(errno, strbuf, sizeof(strbuf));
2045                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2046                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2047                                          sock->fd,
2048                                          isc_msgcat_get(isc_msgcat,
2049                                                         ISC_MSGSET_GENERAL,
2050                                                         ISC_MSG_FAILED,
2051                                                         "failed"),
2052                                          strbuf);
2053                 }
2054 #endif /* IPV6_RECVPKTINFO */
2055 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2056 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2057                 /* use minimum MTU */
2058                 if (sock->pf == AF_INET6) {
2059                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2060                                          IPV6_USE_MIN_MTU,
2061                                          (void *)&on, sizeof(on));
2062                 }
2063 #endif
2064 #endif /* ISC_PLATFORM_HAVEIPV6 */
2065 #endif /* defined(USE_CMSG) */
2066
2067 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2068                 /*
2069                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2070                  */
2071                 if (sock->pf == AF_INET) {
2072                         int action = IP_PMTUDISC_DONT;
2073                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2074                                          &action, sizeof(action));
2075                 }
2076 #endif
2077 #if defined(IP_DONTFRAG)
2078                 /*
2079                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2080                  */
2081                 if (sock->pf == AF_INET) {
2082                         int off = 0;
2083                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2084                                          &off, sizeof(off));
2085                 }
2086 #endif
2087
2088 #if defined(SO_RCVBUF)
2089                 optlen = sizeof(size);
2090                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2091                                (void *)&size, &optlen) >= 0 &&
2092                      size < RCVBUFSIZE) {
2093                         size = RCVBUFSIZE;
2094                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2095                                        (void *)&size, sizeof(size)) == -1) {
2096                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2097                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2098                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2099                                         sock->fd, size,
2100                                         isc_msgcat_get(isc_msgcat,
2101                                                        ISC_MSGSET_GENERAL,
2102                                                        ISC_MSG_FAILED,
2103                                                        "failed"),
2104                                         strbuf);
2105                         }
2106                 }
2107 #endif
2108         }
2109 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2110
2111         return (ISC_R_SUCCESS);
2112 }
2113
2114 /*%
2115  * Create a new 'type' socket managed by 'manager'.  Events
2116  * will be posted to 'task' and when dispatched 'action' will be
2117  * called with 'arg' as the arg value.  The new socket is returned
2118  * in 'socketp'.
2119  */
2120 isc_result_t
2121 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2122                   isc_socket_t **socketp)
2123 {
2124         isc_socket_t *sock = NULL;
2125         isc_result_t result;
2126         int lockid;
2127
2128         REQUIRE(VALID_MANAGER(manager));
2129         REQUIRE(socketp != NULL && *socketp == NULL);
2130
2131         result = allocate_socket(manager, type, &sock);
2132         if (result != ISC_R_SUCCESS)
2133                 return (result);
2134
2135         sock->pf = pf;
2136         result = opensocket(manager, sock);
2137         if (result != ISC_R_SUCCESS) {
2138                 free_socket(&sock);
2139                 return (result);
2140         }
2141
2142         sock->references = 1;
2143         *socketp = sock;
2144
2145         /*
2146          * Note we don't have to lock the socket like we normally would because
2147          * there are no external references to it yet.
2148          */
2149
2150         lockid = FDLOCK_ID(sock->fd);
2151         LOCK(&manager->fdlock[lockid]);
2152         manager->fds[sock->fd] = sock;
2153         manager->fdstate[sock->fd] = MANAGED;
2154 #ifdef USE_DEVPOLL
2155         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2156                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2157 #endif
2158         UNLOCK(&manager->fdlock[lockid]);
2159
2160         LOCK(&manager->lock);
2161         ISC_LIST_APPEND(manager->socklist, sock, link);
2162 #ifdef USE_SELECT
2163         if (manager->maxfd < sock->fd)
2164                 manager->maxfd = sock->fd;
2165 #endif
2166         UNLOCK(&manager->lock);
2167
2168         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2169                    ISC_MSG_CREATED, "created");
2170
2171         return (ISC_R_SUCCESS);
2172 }
2173
2174 isc_result_t
2175 isc_socket_open(isc_socket_t *sock) {
2176         isc_result_t result;
2177
2178         REQUIRE(VALID_SOCKET(sock));
2179
2180         LOCK(&sock->lock);
2181         REQUIRE(sock->references == 1);
2182         UNLOCK(&sock->lock);
2183         /*
2184          * We don't need to retain the lock hereafter, since no one else has
2185          * this socket.
2186          */
2187         REQUIRE(sock->fd == -1);
2188
2189         result = opensocket(sock->manager, sock);
2190         if (result != ISC_R_SUCCESS)
2191                 sock->fd = -1;
2192
2193         if (result == ISC_R_SUCCESS) {
2194                 int lockid = FDLOCK_ID(sock->fd);
2195
2196                 LOCK(&sock->manager->fdlock[lockid]);
2197                 sock->manager->fds[sock->fd] = sock;
2198                 sock->manager->fdstate[sock->fd] = MANAGED;
2199 #ifdef USE_DEVPOLL
2200                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2201                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2202 #endif
2203                 UNLOCK(&sock->manager->fdlock[lockid]);
2204
2205 #ifdef USE_SELECT
2206                 LOCK(&sock->manager->lock);
2207                 if (sock->manager->maxfd < sock->fd)
2208                         sock->manager->maxfd = sock->fd;
2209                 UNLOCK(&sock->manager->lock);
2210 #endif
2211         }
2212
2213         return (result);
2214 }
2215
2216 /*
2217  * Attach to a socket.  Caller must explicitly detach when it is done.
2218  */
2219 void
2220 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2221         REQUIRE(VALID_SOCKET(sock));
2222         REQUIRE(socketp != NULL && *socketp == NULL);
2223
2224         LOCK(&sock->lock);
2225         sock->references++;
2226         UNLOCK(&sock->lock);
2227
2228         *socketp = sock;
2229 }
2230
2231 /*
2232  * Dereference a socket.  If this is the last reference to it, clean things
2233  * up by destroying the socket.
2234  */
2235 void
2236 isc_socket_detach(isc_socket_t **socketp) {
2237         isc_socket_t *sock;
2238         isc_boolean_t kill_socket = ISC_FALSE;
2239
2240         REQUIRE(socketp != NULL);
2241         sock = *socketp;
2242         REQUIRE(VALID_SOCKET(sock));
2243
2244         LOCK(&sock->lock);
2245         REQUIRE(sock->references > 0);
2246         sock->references--;
2247         if (sock->references == 0)
2248                 kill_socket = ISC_TRUE;
2249         UNLOCK(&sock->lock);
2250
2251         if (kill_socket)
2252                 destroy(&sock);
2253
2254         *socketp = NULL;
2255 }
2256
2257 isc_result_t
2258 isc_socket_close(isc_socket_t *sock) {
2259         int fd;
2260
2261         REQUIRE(VALID_SOCKET(sock));
2262
2263         LOCK(&sock->lock);
2264         REQUIRE(sock->references == 1);
2265         UNLOCK(&sock->lock);
2266         /*
2267          * We don't need to retain the lock hereafter, since no one else has
2268          * this socket.
2269          */
2270
2271         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2272
2273         INSIST(!sock->connecting);
2274         INSIST(!sock->pending_recv);
2275         INSIST(!sock->pending_send);
2276         INSIST(!sock->pending_accept);
2277         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2278         INSIST(ISC_LIST_EMPTY(sock->send_list));
2279         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2280         INSIST(sock->connect_ev == NULL);
2281
2282         fd = sock->fd;
2283         sock->fd = -1;
2284         sock->listener = 0;
2285         sock->connected = 0;
2286         sock->connecting = 0;
2287         sock->bound = 0;
2288         isc_sockaddr_any(&sock->address);
2289
2290         closesocket(sock->manager, sock->type, fd);
2291
2292         return (ISC_R_SUCCESS);
2293 }
2294
2295 /*
2296  * I/O is possible on a given socket.  Schedule an event to this task that
2297  * will call an internal function to do the I/O.  This will charge the
2298  * task with the I/O operation and let our select loop handler get back
2299  * to doing something real as fast as possible.
2300  *
2301  * The socket and manager must be locked before calling this function.
2302  */
2303 static void
2304 dispatch_recv(isc_socket_t *sock) {
2305         intev_t *iev;
2306         isc_socketevent_t *ev;
2307
2308         INSIST(!sock->pending_recv);
2309
2310         ev = ISC_LIST_HEAD(sock->recv_list);
2311         if (ev == NULL)
2312                 return;
2313
2314         sock->pending_recv = 1;
2315         iev = &sock->readable_ev;
2316
2317         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2318                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
2319
2320         sock->references++;
2321         iev->ev_sender = sock;
2322         iev->ev_action = internal_recv;
2323         iev->ev_arg = sock;
2324
2325         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2326 }
2327
2328 static void
2329 dispatch_send(isc_socket_t *sock) {
2330         intev_t *iev;
2331         isc_socketevent_t *ev;
2332
2333         INSIST(!sock->pending_send);
2334
2335         ev = ISC_LIST_HEAD(sock->send_list);
2336         if (ev == NULL)
2337                 return;
2338
2339         sock->pending_send = 1;
2340         iev = &sock->writable_ev;
2341
2342         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2343                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
2344
2345         sock->references++;
2346         iev->ev_sender = sock;
2347         iev->ev_action = internal_send;
2348         iev->ev_arg = sock;
2349
2350         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2351 }
2352
2353 /*
2354  * Dispatch an internal accept event.
2355  */
2356 static void
2357 dispatch_accept(isc_socket_t *sock) {
2358         intev_t *iev;
2359         isc_socket_newconnev_t *ev;
2360
2361         INSIST(!sock->pending_accept);
2362
2363         /*
2364          * Are there any done events left, or were they all canceled
2365          * before the manager got the socket lock?
2366          */
2367         ev = ISC_LIST_HEAD(sock->accept_list);
2368         if (ev == NULL)
2369                 return;
2370
2371         sock->pending_accept = 1;
2372         iev = &sock->readable_ev;
2373
2374         sock->references++;  /* keep socket around for this internal event */
2375         iev->ev_sender = sock;
2376         iev->ev_action = internal_accept;
2377         iev->ev_arg = sock;
2378
2379         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2380 }
2381
2382 static void
2383 dispatch_connect(isc_socket_t *sock) {
2384         intev_t *iev;
2385         isc_socket_connev_t *ev;
2386
2387         iev = &sock->writable_ev;
2388
2389         ev = sock->connect_ev;
2390         INSIST(ev != NULL); /* XXX */
2391
2392         INSIST(sock->connecting);
2393
2394         sock->references++;  /* keep socket around for this internal event */
2395         iev->ev_sender = sock;
2396         iev->ev_action = internal_connect;
2397         iev->ev_arg = sock;
2398
2399         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2400 }
2401
2402 /*
2403  * Dequeue an item off the given socket's read queue, set the result code
2404  * in the done event to the one provided, and send it to the task it was
2405  * destined for.
2406  *
2407  * If the event to be sent is on a list, remove it before sending.  If
2408  * asked to, send and detach from the socket as well.
2409  *
2410  * Caller must have the socket locked if the event is attached to the socket.
2411  */
2412 static void
2413 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2414         isc_task_t *task;
2415
2416         task = (*dev)->ev_sender;
2417
2418         (*dev)->ev_sender = sock;
2419
2420         if (ISC_LINK_LINKED(*dev, ev_link))
2421                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2422
2423         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2424             == ISC_SOCKEVENTATTR_ATTACHED)
2425                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2426         else
2427                 isc_task_send(task, (isc_event_t **)dev);
2428 }
2429
2430 /*
2431  * See comments for send_recvdone_event() above.
2432  *
2433  * Caller must have the socket locked if the event is attached to the socket.
2434  */
2435 static void
2436 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2437         isc_task_t *task;
2438
2439         INSIST(dev != NULL && *dev != NULL);
2440
2441         task = (*dev)->ev_sender;
2442         (*dev)->ev_sender = sock;
2443
2444         if (ISC_LINK_LINKED(*dev, ev_link))
2445                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2446
2447         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2448             == ISC_SOCKEVENTATTR_ATTACHED)
2449                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2450         else
2451                 isc_task_send(task, (isc_event_t **)dev);
2452 }
2453
2454 /*
2455  * Call accept() on a socket, to get the new file descriptor.  The listen
2456  * socket is used as a prototype to create a new isc_socket_t.  The new
2457  * socket has one outstanding reference.  The task receiving the event
2458  * will be detached from just after the event is delivered.
2459  *
2460  * On entry to this function, the event delivered is the internal
2461  * readable event, and the first item on the accept_list should be
2462  * the done event we want to send.  If the list is empty, this is a no-op,
2463  * so just unlock and return.
2464  */
2465 static void
2466 internal_accept(isc_task_t *me, isc_event_t *ev) {
2467         isc_socket_t *sock;
2468         isc_socketmgr_t *manager;
2469         isc_socket_newconnev_t *dev;
2470         isc_task_t *task;
2471         ISC_SOCKADDR_LEN_T addrlen;
2472         int fd;
2473         isc_result_t result = ISC_R_SUCCESS;
2474         char strbuf[ISC_STRERRORSIZE];
2475         const char *err = "accept";
2476
2477         UNUSED(me);
2478
2479         sock = ev->ev_sender;
2480         INSIST(VALID_SOCKET(sock));
2481
2482         LOCK(&sock->lock);
2483         socket_log(sock, NULL, TRACE,
2484                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2485                    "internal_accept called, locked socket");
2486
2487         manager = sock->manager;
2488         INSIST(VALID_MANAGER(manager));
2489
2490         INSIST(sock->listener);
2491         INSIST(sock->pending_accept == 1);
2492         sock->pending_accept = 0;
2493
2494         INSIST(sock->references > 0);
2495         sock->references--;  /* the internal event is done with this socket */
2496         if (sock->references == 0) {
2497                 UNLOCK(&sock->lock);
2498                 destroy(&sock);
2499                 return;
2500         }
2501
2502         /*
2503          * Get the first item off the accept list.
2504          * If it is empty, unlock the socket and return.
2505          */
2506         dev = ISC_LIST_HEAD(sock->accept_list);
2507         if (dev == NULL) {
2508                 UNLOCK(&sock->lock);
2509                 return;
2510         }
2511
2512         /*
2513          * Try to accept the new connection.  If the accept fails with
2514          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2515          * again.  Also ignore ECONNRESET, which has been reported to
2516          * be spuriously returned on Linux 2.2.19 although it is not
2517          * a documented error for accept().  ECONNABORTED has been
2518          * reported for Solaris 8.  The rest are thrown in not because
2519          * we have seen them but because they are ignored by other
2520          * deamons such as BIND 8 and Apache.
2521          */
2522
2523         addrlen = sizeof(dev->newsocket->address.type);
2524         memset(&dev->newsocket->address.type, 0, addrlen);
2525         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
2526                     (void *)&addrlen);
2527
2528 #ifdef F_DUPFD
2529         /*
2530          * Leave a space for stdio to work in.
2531          */
2532         if (fd >= 0 && fd < 20) {
2533                 int new, tmp;
2534                 new = fcntl(fd, F_DUPFD, 20);
2535                 tmp = errno;
2536                 (void)close(fd);
2537                 errno = tmp;
2538                 fd = new;
2539                 err = "accept/fcntl";
2540         }
2541 #endif
2542
2543         if (fd < 0) {
2544                 if (SOFT_ERROR(errno))
2545                         goto soft_error;
2546                 switch (errno) {
2547                 case ENFILE:
2548                 case EMFILE:
2549                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2550                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2551                                        isc_msgcat, ISC_MSGSET_SOCKET,
2552                                        ISC_MSG_TOOMANYFDS,
2553                                        "%s: too many open file descriptors",
2554                                        err);
2555                         goto soft_error;
2556
2557                 case ENOBUFS:
2558                 case ENOMEM:
2559                 case ECONNRESET:
2560                 case ECONNABORTED:
2561                 case EHOSTUNREACH:
2562                 case EHOSTDOWN:
2563                 case ENETUNREACH:
2564                 case ENETDOWN:
2565                 case ECONNREFUSED:
2566 #ifdef EPROTO
2567                 case EPROTO:
2568 #endif
2569 #ifdef ENONET
2570                 case ENONET:
2571 #endif
2572                         goto soft_error;
2573                 default:
2574                         break;
2575                 }
2576                 isc__strerror(errno, strbuf, sizeof(strbuf));
2577                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2578                                  "internal_accept: %s() %s: %s", err,
2579                                  isc_msgcat_get(isc_msgcat,
2580                                                 ISC_MSGSET_GENERAL,
2581                                                 ISC_MSG_FAILED,
2582                                                 "failed"),
2583                                  strbuf);
2584                 fd = -1;
2585                 result = ISC_R_UNEXPECTED;
2586         } else {
2587                 if (addrlen == 0U) {
2588                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2589                                          "internal_accept(): "
2590                                          "accept() failed to return "
2591                                          "remote address");
2592
2593                         (void)close(fd);
2594                         goto soft_error;
2595                 } else if (dev->newsocket->address.type.sa.sa_family !=
2596                            sock->pf)
2597                 {
2598                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2599                                          "internal_accept(): "
2600                                          "accept() returned peer address "
2601                                          "family %u (expected %u)",
2602                                          dev->newsocket->address.
2603                                          type.sa.sa_family,
2604                                          sock->pf);
2605                         (void)close(fd);
2606                         goto soft_error;
2607                 } else if (fd >= (int)manager->maxsocks) {
2608                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2609                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2610                                        isc_msgcat, ISC_MSGSET_SOCKET,
2611                                        ISC_MSG_TOOMANYFDS,
2612                                        "accept: "
2613                                        "file descriptor exceeds limit (%d/%u)",
2614                                        fd, manager->maxsocks);
2615                         (void)close(fd);
2616                         goto soft_error;
2617                 }
2618         }
2619
2620         if (fd != -1) {
2621                 dev->newsocket->address.length = addrlen;
2622                 dev->newsocket->pf = sock->pf;
2623         }
2624
2625         /*
2626          * Pull off the done event.
2627          */
2628         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2629
2630         /*
2631          * Poke watcher if there are more pending accepts.
2632          */
2633         if (!ISC_LIST_EMPTY(sock->accept_list))
2634                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2635
2636         UNLOCK(&sock->lock);
2637
2638         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2639                 (void)close(fd);
2640                 fd = -1;
2641                 result = ISC_R_UNEXPECTED;
2642         }
2643
2644         /*
2645          * -1 means the new socket didn't happen.
2646          */
2647         if (fd != -1) {
2648                 int lockid = FDLOCK_ID(fd);
2649
2650                 LOCK(&manager->fdlock[lockid]);
2651                 manager->fds[fd] = dev->newsocket;
2652                 manager->fdstate[fd] = MANAGED;
2653                 UNLOCK(&manager->fdlock[lockid]);
2654
2655                 LOCK(&manager->lock);
2656                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2657
2658                 dev->newsocket->fd = fd;
2659                 dev->newsocket->bound = 1;
2660                 dev->newsocket->connected = 1;
2661
2662                 /*
2663                  * Save away the remote address
2664                  */
2665                 dev->address = dev->newsocket->address;
2666
2667 #ifdef USE_SELECT
2668                 if (manager->maxfd < fd)
2669                         manager->maxfd = fd;
2670 #endif
2671
2672                 socket_log(sock, &dev->newsocket->address, CREATION,
2673                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2674                            "accepted connection, new socket %p",
2675                            dev->newsocket);
2676
2677                 UNLOCK(&manager->lock);
2678         } else {
2679                 dev->newsocket->references--;
2680                 free_socket(&dev->newsocket);
2681         }
2682
2683         /*
2684          * Fill in the done event details and send it off.
2685          */
2686         dev->result = result;
2687         task = dev->ev_sender;
2688         dev->ev_sender = sock;
2689
2690         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2691         return;
2692
2693  soft_error:
2694         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2695         UNLOCK(&sock->lock);
2696         return;
2697 }
2698
2699 static void
2700 internal_recv(isc_task_t *me, isc_event_t *ev) {
2701         isc_socketevent_t *dev;
2702         isc_socket_t *sock;
2703
2704         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2705
2706         sock = ev->ev_sender;
2707         INSIST(VALID_SOCKET(sock));
2708
2709         LOCK(&sock->lock);
2710         socket_log(sock, NULL, IOEVENT,
2711                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2712                    "internal_recv: task %p got event %p", me, ev);
2713
2714         INSIST(sock->pending_recv == 1);
2715         sock->pending_recv = 0;
2716
2717         INSIST(sock->references > 0);
2718         sock->references--;  /* the internal event is done with this socket */
2719         if (sock->references == 0) {
2720                 UNLOCK(&sock->lock);
2721                 destroy(&sock);
2722                 return;
2723         }
2724
2725         /*
2726          * Try to do as much I/O as possible on this socket.  There are no
2727          * limits here, currently.
2728          */
2729         dev = ISC_LIST_HEAD(sock->recv_list);
2730         while (dev != NULL) {
2731                 switch (doio_recv(sock, dev)) {
2732                 case DOIO_SOFT:
2733                         goto poke;
2734
2735                 case DOIO_EOF:
2736                         /*
2737                          * read of 0 means the remote end was closed.
2738                          * Run through the event queue and dispatch all
2739                          * the events with an EOF result code.
2740                          */
2741                         do {
2742                                 dev->result = ISC_R_EOF;
2743                                 send_recvdone_event(sock, &dev);
2744                                 dev = ISC_LIST_HEAD(sock->recv_list);
2745                         } while (dev != NULL);
2746                         goto poke;
2747
2748                 case DOIO_SUCCESS:
2749                 case DOIO_HARD:
2750                         send_recvdone_event(sock, &dev);
2751                         break;
2752                 }
2753
2754                 dev = ISC_LIST_HEAD(sock->recv_list);
2755         }
2756
2757  poke:
2758         if (!ISC_LIST_EMPTY(sock->recv_list))
2759                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2760
2761         UNLOCK(&sock->lock);
2762 }
2763
2764 static void
2765 internal_send(isc_task_t *me, isc_event_t *ev) {
2766         isc_socketevent_t *dev;
2767         isc_socket_t *sock;
2768
2769         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2770
2771         /*
2772          * Find out what socket this is and lock it.
2773          */
2774         sock = (isc_socket_t *)ev->ev_sender;
2775         INSIST(VALID_SOCKET(sock));
2776
2777         LOCK(&sock->lock);
2778         socket_log(sock, NULL, IOEVENT,
2779                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2780                    "internal_send: task %p got event %p", me, ev);
2781
2782         INSIST(sock->pending_send == 1);
2783         sock->pending_send = 0;
2784
2785         INSIST(sock->references > 0);
2786         sock->references--;  /* the internal event is done with this socket */
2787         if (sock->references == 0) {
2788                 UNLOCK(&sock->lock);
2789                 destroy(&sock);
2790                 return;
2791         }
2792
2793         /*
2794          * Try to do as much I/O as possible on this socket.  There are no
2795          * limits here, currently.
2796          */
2797         dev = ISC_LIST_HEAD(sock->send_list);
2798         while (dev != NULL) {
2799                 switch (doio_send(sock, dev)) {
2800                 case DOIO_SOFT:
2801                         goto poke;
2802
2803                 case DOIO_HARD:
2804                 case DOIO_SUCCESS:
2805                         send_senddone_event(sock, &dev);
2806                         break;
2807                 }
2808
2809                 dev = ISC_LIST_HEAD(sock->send_list);
2810         }
2811
2812  poke:
2813         if (!ISC_LIST_EMPTY(sock->send_list))
2814                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2815
2816         UNLOCK(&sock->lock);
2817 }
2818
2819 /*
2820  * Process read/writes on each fd here.  Avoid locking
2821  * and unlocking twice if both reads and writes are possible.
2822  */
2823 static void
2824 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
2825            isc_boolean_t writeable)
2826 {
2827         isc_socket_t *sock;
2828         isc_boolean_t unlock_sock;
2829         int lockid = FDLOCK_ID(fd);
2830
2831         /*
2832          * If the socket is going to be closed, don't do more I/O.
2833          */
2834         LOCK(&manager->fdlock[lockid]);
2835         if (manager->fdstate[fd] == CLOSE_PENDING) {
2836                 UNLOCK(&manager->fdlock[lockid]);
2837
2838                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
2839                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
2840                 return;
2841         }
2842
2843         sock = manager->fds[fd];
2844         UNLOCK(&manager->fdlock[lockid]);
2845         unlock_sock = ISC_FALSE;
2846         if (readable) {
2847                 if (sock == NULL) {
2848                         (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
2849                         goto check_write;
2850                 }
2851                 unlock_sock = ISC_TRUE;
2852                 LOCK(&sock->lock);
2853                 if (!SOCK_DEAD(sock)) {
2854                         if (sock->listener)
2855                                 dispatch_accept(sock);
2856                         else
2857                                 dispatch_recv(sock);
2858                 }
2859                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
2860         }
2861 check_write:
2862         if (writeable) {
2863                 if (sock == NULL) {
2864                         (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
2865                         return;
2866                 }
2867                 if (!unlock_sock) {
2868                         unlock_sock = ISC_TRUE;
2869                         LOCK(&sock->lock);
2870                 }
2871                 if (!SOCK_DEAD(sock)) {
2872                         if (sock->connecting)
2873                                 dispatch_connect(sock);
2874                         else
2875                                 dispatch_send(sock);
2876                 }
2877                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
2878         }
2879         if (unlock_sock)
2880                 UNLOCK(&sock->lock);
2881 }
2882
#ifdef USE_KQUEUE
/*
 * kqueue variant: hand each of the 'nevents' entries in 'events' to
 * process_fd().  In threaded builds an event on the read end of the
 * control pipe is not passed to process_fd(); it is remembered and the
 * control messages are processed after all other events.
 *
 * Returns the result of process_ctlfd() when the control pipe had an
 * event, ISC_FALSE otherwise.
 */
static isc_boolean_t
process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
        isc_boolean_t done = ISC_FALSE;
        isc_boolean_t readable, writable;
        struct kevent *kev;
        int n;
#ifdef ISC_PLATFORM_USETHREADS
        isc_boolean_t ctl_pending = ISC_FALSE;
#endif

        if (nevents == manager->nevents) {
                /*
                 * A completely full event array is not an error, only
                 * unexpected; it may mean ISC_SOCKET_MAXEVENTS should be
                 * increased.
                 */
                manager_log(manager, ISC_LOGCATEGORY_GENERAL,
                            ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
                            "maximum number of FD events (%d) received",
                            nevents);
        }

        for (n = 0; n < nevents; n++) {
                kev = &events[n];

                REQUIRE(kev->ident < manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
                if (kev->ident == (uintptr_t)manager->pipe_fds[0]) {
                        ctl_pending = ISC_TRUE;
                        continue;
                }
#endif
                readable = ISC_TF(kev->filter == EVFILT_READ);
                writable = ISC_TF(kev->filter == EVFILT_WRITE);
                process_fd(manager, kev->ident, readable, writable);
        }

#ifdef ISC_PLATFORM_USETHREADS
        if (ctl_pending)
                done = process_ctlfd(manager);
#endif

        return (done);
}
#elif defined(USE_EPOLL)
/*
 * epoll variant: hand each of the 'nevents' entries in 'events' to
 * process_fd().  In threaded builds an event on the read end of the
 * control pipe is not passed to process_fd(); it is remembered and the
 * control messages are processed after all other events.
 *
 * Returns the result of process_ctlfd() when the control pipe had an
 * event, ISC_FALSE otherwise.
 */
static isc_boolean_t
process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
        isc_boolean_t done = ISC_FALSE;
        struct epoll_event *eev;
        int n;
#ifdef ISC_PLATFORM_USETHREADS
        isc_boolean_t ctl_pending = ISC_FALSE;
#endif

        if (nevents == manager->nevents) {
                manager_log(manager, ISC_LOGCATEGORY_GENERAL,
                            ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
                            "maximum number of FD events (%d) received",
                            nevents);
        }

        for (n = 0; n < nevents; n++) {
                eev = &events[n];

                REQUIRE(eev->data.fd < (int)manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
                if (eev->data.fd == manager->pipe_fds[0]) {
                        ctl_pending = ISC_TRUE;
                        continue;
                }
#endif
                if ((eev->events & (EPOLLERR | EPOLLHUP)) != 0) {
                        /*
                         * epoll does not set the IN/OUT bits for an error
                         * condition, so both directions are tried.  That is
                         * slightly wasteful but such events are rare, and
                         * the attempts cannot block because the sockets are
                         * non-blocking.
                         */
                        eev->events |= (EPOLLIN | EPOLLOUT);
                }
                process_fd(manager, eev->data.fd,
                           (eev->events & EPOLLIN) != 0,
                           (eev->events & EPOLLOUT) != 0);
        }

#ifdef ISC_PLATFORM_USETHREADS
        if (ctl_pending)
                done = process_ctlfd(manager);
#endif

        return (done);
}
#elif defined(USE_DEVPOLL)
/*
 * /dev/poll variant: hand each of the 'nevents' entries in 'events' to
 * process_fd().  In threaded builds an event on the read end of the
 * control pipe is not passed to process_fd(); it is remembered and the
 * control messages are processed after all other events.
 *
 * Returns the result of process_ctlfd() when the control pipe had an
 * event, ISC_FALSE otherwise.
 */
static isc_boolean_t
process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
        isc_boolean_t done = ISC_FALSE;
        struct pollfd *pev;
        int n;
#ifdef ISC_PLATFORM_USETHREADS
        isc_boolean_t ctl_pending = ISC_FALSE;
#endif

        if (nevents == manager->nevents) {
                manager_log(manager, ISC_LOGCATEGORY_GENERAL,
                            ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
                            "maximum number of FD events (%d) received",
                            nevents);
        }

        for (n = 0; n < nevents; n++) {
                pev = &events[n];

                REQUIRE(pev->fd < (int)manager->maxsocks);
#ifdef ISC_PLATFORM_USETHREADS
                if (pev->fd == manager->pipe_fds[0]) {
                        ctl_pending = ISC_TRUE;
                        continue;
                }
#endif
                process_fd(manager, pev->fd,
                           (pev->events & POLLIN) != 0,
                           (pev->events & POLLOUT) != 0);
        }

#ifdef ISC_PLATFORM_USETHREADS
        if (ctl_pending)
                done = process_ctlfd(manager);
#endif

        return (done);
}
#elif defined(USE_SELECT)
/*
 * select variant: call process_fd() for every descriptor below 'maxfd',
 * passing whether it is set in 'readfds' and in 'writefds'.  In threaded
 * builds both ends of the control pipe are skipped here.
 */
static void
process_fds(isc_socketmgr_t *manager, int maxfd,
            fd_set *readfds, fd_set *writefds)
{
        int fd;

        REQUIRE(maxfd <= (int)manager->maxsocks);

        for (fd = 0; fd < maxfd; fd++) {
#ifdef ISC_PLATFORM_USETHREADS
                if (fd == manager->pipe_fds[0] || fd == manager->pipe_fds[1])
                        continue;
#endif /* ISC_PLATFORM_USETHREADS */
                process_fd(manager, fd, FD_ISSET(fd, readfds),
                           FD_ISSET(fd, writefds));
        }
}
#endif
3027
3028 #ifdef ISC_PLATFORM_USETHREADS
3029 static isc_boolean_t
3030 process_ctlfd(isc_socketmgr_t *manager) {
3031         int msg, fd;
3032
3033         for (;;) {
3034                 select_readmsg(manager, &fd, &msg);
3035
3036                 manager_log(manager, IOEVENT,
3037                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3038                                            ISC_MSG_WATCHERMSG,
3039                                            "watcher got message %d "
3040                                            "for socket %d"), msg, fd);
3041
3042                 /*
3043                  * Nothing to read?
3044                  */
3045                 if (msg == SELECT_POKE_NOTHING)
3046                         break;
3047
3048                 /*
3049                  * Handle shutdown message.  We really should
3050                  * jump out of this loop right away, but
3051                  * it doesn't matter if we have to do a little
3052                  * more work first.
3053                  */
3054                 if (msg == SELECT_POKE_SHUTDOWN)
3055                         return (ISC_TRUE);
3056
3057                 /*
3058                  * This is a wakeup on a socket.  Look
3059                  * at the event queue for both read and write,
3060                  * and decide if we need to watch on it now
3061                  * or not.
3062                  */
3063                 wakeup_socket(manager, fd, msg);
3064         }
3065
3066         return (ISC_FALSE);
3067 }
3068
3069 /*
3070  * This is the thread that will loop forever, always in a select or poll
3071  * call.
3072  *
3073  * When select returns something to do, track down what thread gets to do
3074  * this I/O and post the event to it.
3075  */
3076 static isc_threadresult_t
3077 watcher(void *uap) {
3078         isc_socketmgr_t *manager = uap;
3079         isc_boolean_t done;
3080         int ctlfd;
3081         int cc;
3082 #ifdef USE_KQUEUE
3083         const char *fnname = "kevent()";
3084 #elif defined (USE_EPOLL)
3085         const char *fnname = "epoll_wait()";
3086 #elif defined(USE_DEVPOLL)
3087         const char *fnname = "ioctl(DP_POLL)";
3088         struct dvpoll dvp;
3089 #elif defined (USE_SELECT)
3090         const char *fnname = "select()";
3091         int maxfd;
3092 #endif
3093         char strbuf[ISC_STRERRORSIZE];
3094 #ifdef ISC_SOCKET_USE_POLLWATCH
3095         pollstate_t pollstate = poll_idle;
3096 #endif
3097
3098         /*
3099          * Get the control fd here.  This will never change.
3100          */
3101         ctlfd = manager->pipe_fds[0];
3102         done = ISC_FALSE;
3103         while (!done) {
3104                 do {
3105 #ifdef USE_KQUEUE
3106                         cc = kevent(manager->kqueue_fd, NULL, 0,
3107                                     manager->events, manager->nevents, NULL);
3108 #elif defined(USE_EPOLL)
3109                         cc = epoll_wait(manager->epoll_fd, manager->events,
3110                                         manager->nevents, -1);
3111 #elif defined(USE_DEVPOLL)
3112                         dvp.dp_fds = manager->events;
3113                         dvp.dp_nfds = manager->nevents;
3114 #ifndef ISC_SOCKET_USE_POLLWATCH
3115                         dvp.dp_timeout = -1;
3116 #else
3117                         if (pollstate == poll_idle)
3118                                 dvp.dp_timeout = -1;
3119                         else
3120                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3121 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3122                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3123 #elif defined(USE_SELECT)
3124                         LOCK(&manager->lock);
3125                         memcpy(manager->read_fds_copy, manager->read_fds,
3126                                manager->fd_bufsize);
3127                         memcpy(manager->write_fds_copy, manager->write_fds,
3128                                manager->fd_bufsize);
3129                         maxfd = manager->maxfd + 1;
3130                         UNLOCK(&manager->lock);
3131
3132                         cc = select(maxfd, manager->read_fds_copy,
3133                                     manager->write_fds_copy, NULL, NULL);
3134 #endif  /* USE_KQUEUE */
3135
3136                         if (cc < 0 && !SOFT_ERROR(errno)) {
3137                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3138                                 FATAL_ERROR(__FILE__, __LINE__,
3139                                             "%s %s: %s", fnname,
3140                                             isc_msgcat_get(isc_msgcat,
3141                                                            ISC_MSGSET_GENERAL,
3142                                                            ISC_MSG_FAILED,
3143                                                            "failed"), strbuf);
3144                         }
3145
3146 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3147                         if (cc == 0) {
3148                                 if (pollstate == poll_active)
3149                                         pollstate = poll_checking;
3150                                 else if (pollstate == poll_checking)
3151                                         pollstate = poll_idle;
3152                         } else if (cc > 0) {
3153                                 if (pollstate == poll_checking) {
3154                                         /*
3155                                          * XXX: We'd like to use a more
3156                                          * verbose log level as it's actually an
3157                                          * unexpected event, but the kernel bug
3158                                          * reportedly happens pretty frequently
3159                                          * (and it can also be a false positive)
3160                                          * so it would be just too noisy.
3161                                          */
3162                                         manager_log(manager,
3163                                                     ISC_LOGCATEGORY_GENERAL,
3164                                                     ISC_LOGMODULE_SOCKET,
3165                                                     ISC_LOG_DEBUG(1),
3166                                                     "unexpected POLL timeout");
3167                                 }
3168                                 pollstate = poll_active;
3169                         }
3170 #endif
3171                 } while (cc < 0);
3172
3173 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3174                 done = process_fds(manager, manager->events, cc);
3175 #elif defined(USE_SELECT)
3176                 process_fds(manager, maxfd, manager->read_fds_copy,
3177                             manager->write_fds_copy);
3178
3179                 /*
3180                  * Process reads on internal, control fd.
3181                  */
3182                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3183                         done = process_ctlfd(manager);
3184 #endif
3185         }
3186
3187         manager_log(manager, TRACE,
3188                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3189                                    ISC_MSG_EXITING, "watcher exiting"));
3190
3191         return ((isc_threadresult_t)0);
3192 }
3193 #endif /* ISC_PLATFORM_USETHREADS */
3194
3195 void
3196 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3197
3198         REQUIRE(VALID_MANAGER(manager));
3199
3200         manager->reserved = reserved;
3201 }
3202
3203 /*
3204  * Create a new socket manager.
3205  */
3206
3207 static isc_result_t
3208 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3209         isc_result_t result;
3210
3211 #ifdef USE_KQUEUE
3212         manager->nevents = ISC_SOCKET_MAXEVENTS;
3213         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3214                                       manager->nevents);
3215         if (manager->events == NULL)
3216                 return (ISC_R_NOMEMORY);
3217         manager->kqueue_fd = kqueue();
3218         if (manager->kqueue_fd == -1) {
3219                 result = isc__errno2result(errno);
3220                 isc_mem_put(mctx, manager->events,
3221                             sizeof(struct kevent) * manager->nevents);
3222                 return (result);
3223         }
3224
3225 #ifdef ISC_PLATFORM_USETHREADS
3226         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3227         if (result != ISC_R_SUCCESS) {
3228                 close(manager->kqueue_fd);
3229                 isc_mem_put(mctx, manager->events,
3230                             sizeof(struct kevent) * manager->nevents);
3231                 return (result);
3232         }
3233 #endif  /* ISC_PLATFORM_USETHREADS */
3234 #elif defined(USE_EPOLL)
3235         manager->nevents = ISC_SOCKET_MAXEVENTS;
3236         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3237                                       manager->nevents);
3238         if (manager->events == NULL)
3239                 return (ISC_R_NOMEMORY);
3240         manager->epoll_fd = epoll_create(manager->nevents);
3241         if (manager->epoll_fd == -1) {
3242                 result = isc__errno2result(errno);
3243                 isc_mem_put(mctx, manager->events,
3244                             sizeof(struct epoll_event) * manager->nevents);
3245                 return (result);
3246         }
3247 #ifdef ISC_PLATFORM_USETHREADS
3248         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3249         if (result != ISC_R_SUCCESS) {
3250                 close(manager->epoll_fd);
3251                 isc_mem_put(mctx, manager->events,
3252                             sizeof(struct epoll_event) * manager->nevents);
3253                 return (result);
3254         }
3255 #endif  /* ISC_PLATFORM_USETHREADS */
3256 #elif defined(USE_DEVPOLL)
3257         /*
3258          * XXXJT: /dev/poll seems to reject large numbers of events,
3259          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3260          */
3261         manager->nevents = ISC_SOCKET_MAXEVENTS;
3262         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3263                                       manager->nevents);
3264         if (manager->events == NULL)
3265                 return (ISC_R_NOMEMORY);
3266         /*
3267          * Note: fdpollinfo should be able to support all possible FDs, so
3268          * it must have maxsocks entries (not nevents).
3269          */
3270         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3271                                           manager->maxsocks);
3272         if (manager->fdpollinfo == NULL) {
3273                 isc_mem_put(mctx, manager->events,
3274                             sizeof(pollinfo_t) * manager->maxsocks);
3275                 return (ISC_R_NOMEMORY);
3276         }
3277         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3278         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3279         if (manager->devpoll_fd == -1) {
3280                 result = isc__errno2result(errno);
3281                 isc_mem_put(mctx, manager->events,
3282                             sizeof(struct pollfd) * manager->nevents);
3283                 isc_mem_put(mctx, manager->fdpollinfo,
3284                             sizeof(pollinfo_t) * manager->maxsocks);
3285                 return (result);
3286         }
3287 #ifdef ISC_PLATFORM_USETHREADS
3288         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3289         if (result != ISC_R_SUCCESS) {
3290                 close(manager->devpoll_fd);
3291                 isc_mem_put(mctx, manager->events,
3292                             sizeof(struct pollfd) * manager->nevents);
3293                 isc_mem_put(mctx, manager->fdpollinfo,
3294                             sizeof(pollinfo_t) * manager->maxsocks);
3295                 return (result);
3296         }
3297 #endif  /* ISC_PLATFORM_USETHREADS */
3298 #elif defined(USE_SELECT)
3299         UNUSED(result);
3300
3301 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3302         /*
3303          * Note: this code should also cover the case of MAXSOCKETS <=
3304          * FD_SETSIZE, but we separate the cases to avoid possible portability
3305          * issues regarding howmany() and the actual representation of fd_set.
3306          */
3307         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3308                 sizeof(fd_mask);
3309 #else
3310         manager->fd_bufsize = sizeof(fd_set);
3311 #endif
3312
3313         manager->read_fds = NULL;
3314         manager->read_fds_copy = NULL;
3315         manager->write_fds = NULL;
3316         manager->write_fds_copy = NULL;
3317
3318         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3319         if (manager->read_fds != NULL)
3320                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3321         if (manager->read_fds_copy != NULL)
3322                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3323         if (manager->write_fds != NULL) {
3324                 manager->write_fds_copy = isc_mem_get(mctx,
3325                                                       manager->fd_bufsize);
3326         }
3327         if (manager->write_fds_copy == NULL) {
3328                 if (manager->write_fds != NULL) {
3329                         isc_mem_put(mctx, manager->write_fds,
3330                                     manager->fd_bufsize);
3331                 }
3332                 if (manager->read_fds_copy != NULL) {
3333                         isc_mem_put(mctx, manager->read_fds_copy,
3334                                     manager->fd_bufsize);
3335                 }
3336                 if (manager->read_fds != NULL) {
3337                         isc_mem_put(mctx, manager->read_fds,
3338                                     manager->fd_bufsize);
3339                 }
3340                 return (ISC_R_NOMEMORY);
3341         }
3342         memset(manager->read_fds, 0, manager->fd_bufsize);
3343         memset(manager->write_fds, 0, manager->fd_bufsize);
3344
3345 #ifdef ISC_PLATFORM_USETHREADS
3346         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3347         manager->maxfd = manager->pipe_fds[0];
3348 #else /* ISC_PLATFORM_USETHREADS */
3349         manager->maxfd = 0;
3350 #endif /* ISC_PLATFORM_USETHREADS */
3351 #endif  /* USE_KQUEUE */
3352
3353         return (ISC_R_SUCCESS);
3354 }
3355
/*
 * Release the event-notification resources owned by 'manager'.
 *
 * In threaded builds the read end of the internal wake-up pipe is first
 * removed from the watched set.  Then, depending on which backend was
 * compiled in, the backend descriptor is closed and the per-backend
 * buffers are returned to 'mctx' with the same sizes this function
 * computes from manager->nevents / manager->maxsocks / manager->fd_bufsize.
 *
 * The pipe descriptors themselves are not closed here.
 */
static void
cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
#ifdef ISC_PLATFORM_USETHREADS
        isc_result_t result;

        result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
        if (result != ISC_R_SUCCESS) {
                /*
                 * NOTE(review): the message names epoll_ctl(DEL) even though
                 * this path is compiled for every backend, not just epoll.
                 */
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "epoll_ctl(DEL) %s",
                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED, "failed"));
        }
#endif  /* ISC_PLATFORM_USETHREADS */

#ifdef USE_KQUEUE
        /* kqueue: close the queue descriptor and free the kevent array. */
        close(manager->kqueue_fd);
        isc_mem_put(mctx, manager->events,
                    sizeof(struct kevent) * manager->nevents);
#elif defined(USE_EPOLL)
        /* epoll: close the epoll descriptor and free the event array. */
        close(manager->epoll_fd);
        isc_mem_put(mctx, manager->events,
                    sizeof(struct epoll_event) * manager->nevents);
#elif defined(USE_DEVPOLL)
        /*
         * /dev/poll: the event array is sized by nevents, while fdpollinfo
         * is sized by maxsocks.
         */
        close(manager->devpoll_fd);
        isc_mem_put(mctx, manager->events,
                    sizeof(struct pollfd) * manager->nevents);
        isc_mem_put(mctx, manager->fdpollinfo,
                    sizeof(pollinfo_t) * manager->maxsocks);
#elif defined(USE_SELECT)
        /* select: free whichever of the four fd-set buffers were allocated. */
        if (manager->read_fds != NULL)
                isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
        if (manager->read_fds_copy != NULL)
                isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
        if (manager->write_fds != NULL)
                isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
        if (manager->write_fds_copy != NULL)
                isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
#endif  /* USE_KQUEUE */
}
3395
3396 isc_result_t
3397 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3398         return (isc_socketmgr_create2(mctx, managerp, 0));
3399 }
3400
3401 isc_result_t
3402 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3403                       unsigned int maxsocks)
3404 {
3405         int i;
3406         isc_socketmgr_t *manager;
3407 #ifdef ISC_PLATFORM_USETHREADS
3408         char strbuf[ISC_STRERRORSIZE];
3409 #endif
3410         isc_result_t result;
3411
3412         REQUIRE(managerp != NULL && *managerp == NULL);
3413
3414 #ifndef ISC_PLATFORM_USETHREADS
3415         if (socketmgr != NULL) {
3416                 /* Don't allow maxsocks to be updated */
3417                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3418                         return (ISC_R_EXISTS);
3419
3420                 socketmgr->refs++;
3421                 *managerp = socketmgr;
3422                 return (ISC_R_SUCCESS);
3423         }
3424 #endif /* ISC_PLATFORM_USETHREADS */
3425
3426         if (maxsocks == 0)
3427                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3428
3429         manager = isc_mem_get(mctx, sizeof(*manager));
3430         if (manager == NULL)
3431                 return (ISC_R_NOMEMORY);
3432
3433         /* zero-clear so that necessary cleanup on failure will be easy */
3434         memset(manager, 0, sizeof(*manager));
3435         manager->maxsocks = maxsocks;
3436         manager->reserved = 0;
3437         manager->fds = isc_mem_get(mctx,
3438                                    manager->maxsocks * sizeof(isc_socket_t *));
3439         if (manager->fds == NULL) {
3440                 result = ISC_R_NOMEMORY;
3441                 goto free_manager;
3442         }
3443         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3444         if (manager->fds == NULL) {
3445                 result = ISC_R_NOMEMORY;
3446                 goto free_manager;
3447         }
3448
3449         manager->magic = SOCKET_MANAGER_MAGIC;
3450         manager->mctx = NULL;
3451         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3452         ISC_LIST_INIT(manager->socklist);
3453         result = isc_mutex_init(&manager->lock);
3454         if (result != ISC_R_SUCCESS)
3455                 goto free_manager;
3456         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3457         if (manager->fdlock == NULL) {
3458                 result = ISC_R_NOMEMORY;
3459                 goto cleanup_lock;
3460         }
3461         for (i = 0; i < FDLOCK_COUNT; i++) {
3462                 result = isc_mutex_init(&manager->fdlock[i]);
3463                 if (result != ISC_R_SUCCESS) {
3464                         while (--i >= 0)
3465                                 DESTROYLOCK(&manager->fdlock[i]);
3466                         isc_mem_put(mctx, manager->fdlock,
3467                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3468                         manager->fdlock = NULL;
3469                         goto cleanup_lock;
3470                 }
3471         }
3472
3473 #ifdef ISC_PLATFORM_USETHREADS
3474         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3475                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3476                                  "isc_condition_init() %s",
3477                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3478                                                 ISC_MSG_FAILED, "failed"));
3479                 result = ISC_R_UNEXPECTED;
3480                 goto cleanup_lock;
3481         }
3482
3483         /*
3484          * Create the special fds that will be used to wake up the
3485          * select/poll loop when something internal needs to be done.
3486          */
3487         if (pipe(manager->pipe_fds) != 0) {
3488                 isc__strerror(errno, strbuf, sizeof(strbuf));
3489                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3490                                  "pipe() %s: %s",
3491                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3492                                                 ISC_MSG_FAILED, "failed"),
3493                                  strbuf);
3494                 result = ISC_R_UNEXPECTED;
3495                 goto cleanup_condition;
3496         }
3497
3498         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3499 #if 0
3500         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3501 #endif
3502 #else /* ISC_PLATFORM_USETHREADS */
3503         manager->refs = 1;
3504 #endif /* ISC_PLATFORM_USETHREADS */
3505
3506         /*
3507          * Set up initial state for the select loop
3508          */
3509         result = setup_watcher(mctx, manager);
3510         if (result != ISC_R_SUCCESS)
3511                 goto cleanup;
3512         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3513 #ifdef ISC_PLATFORM_USETHREADS
3514         /*
3515          * Start up the select/poll thread.
3516          */
3517         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3518             ISC_R_SUCCESS) {
3519                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3520                                  "isc_thread_create() %s",
3521                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3522                                                 ISC_MSG_FAILED, "failed"));
3523                 cleanup_watcher(mctx, manager);
3524                 result = ISC_R_UNEXPECTED;
3525                 goto cleanup;
3526         }
3527 #endif /* ISC_PLATFORM_USETHREADS */
3528         isc_mem_attach(mctx, &manager->mctx);
3529
3530 #ifndef ISC_PLATFORM_USETHREADS
3531         socketmgr = manager;
3532 #endif /* ISC_PLATFORM_USETHREADS */
3533         *managerp = manager;
3534
3535         return (ISC_R_SUCCESS);
3536
3537 cleanup:
3538 #ifdef ISC_PLATFORM_USETHREADS
3539         (void)close(manager->pipe_fds[0]);
3540         (void)close(manager->pipe_fds[1]);
3541 #endif  /* ISC_PLATFORM_USETHREADS */
3542
3543 #ifdef ISC_PLATFORM_USETHREADS
3544 cleanup_condition:
3545         (void)isc_condition_destroy(&manager->shutdown_ok);
3546 #endif  /* ISC_PLATFORM_USETHREADS */
3547
3548
3549 cleanup_lock:
3550         if (manager->fdlock != NULL) {
3551                 for (i = 0; i < FDLOCK_COUNT; i++)
3552                         DESTROYLOCK(&manager->fdlock[i]);
3553         }
3554         DESTROYLOCK(&manager->lock);
3555
3556 free_manager:
3557         if (manager->fdlock != NULL) {
3558                 isc_mem_put(mctx, manager->fdlock,
3559                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3560         }
3561         if (manager->fdstate != NULL) {
3562                 isc_mem_put(mctx, manager->fdstate,
3563                             manager->maxsocks * sizeof(int));
3564         }
3565         if (manager->fds != NULL) {
3566                 isc_mem_put(mctx, manager->fds,
3567                             manager->maxsocks * sizeof(isc_socket_t *));
3568         }
3569         isc_mem_put(mctx, manager, sizeof(*manager));
3570
3571         return (result);
3572 }
3573
3574 isc_result_t
3575 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3576         REQUIRE(VALID_MANAGER(manager));
3577         REQUIRE(nsockp != NULL);
3578
3579         *nsockp = manager->maxsocks;
3580
3581         return (ISC_R_SUCCESS);
3582 }
3583
3584 void
3585 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
3586         isc_socketmgr_t *manager;
3587         int i;
3588         isc_mem_t *mctx;
3589
3590         /*
3591          * Destroy a socket manager.
3592          */
3593
3594         REQUIRE(managerp != NULL);
3595         manager = *managerp;
3596         REQUIRE(VALID_MANAGER(manager));
3597
3598 #ifndef ISC_PLATFORM_USETHREADS
3599         if (manager->refs > 1) {
3600                 manager->refs--;
3601                 *managerp = NULL;
3602                 return;
3603         }
3604 #endif /* ISC_PLATFORM_USETHREADS */
3605
3606         LOCK(&manager->lock);
3607
3608 #ifdef ISC_PLATFORM_USETHREADS
3609         /*
3610          * Wait for all sockets to be destroyed.
3611          */
3612         while (!ISC_LIST_EMPTY(manager->socklist)) {
3613                 manager_log(manager, CREATION,
3614                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3615                                            ISC_MSG_SOCKETSREMAIN,
3616                                            "sockets exist"));
3617                 WAIT(&manager->shutdown_ok, &manager->lock);
3618         }
3619 #else /* ISC_PLATFORM_USETHREADS */
3620         /*
3621          * Hope all sockets have been destroyed.
3622          */
3623         if (!ISC_LIST_EMPTY(manager->socklist)) {
3624                 manager_log(manager, CREATION,
3625                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3626                                            ISC_MSG_SOCKETSREMAIN,
3627                                            "sockets exist"));
3628                 INSIST(0);
3629         }
3630 #endif /* ISC_PLATFORM_USETHREADS */
3631
3632         UNLOCK(&manager->lock);
3633
3634         /*
3635          * Here, poke our select/poll thread.  Do this by closing the write
3636          * half of the pipe, which will send EOF to the read half.
3637          * This is currently a no-op in the non-threaded case.
3638          */
3639         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
3640
3641 #ifdef ISC_PLATFORM_USETHREADS
3642         /*
3643          * Wait for thread to exit.
3644          */
3645         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
3646                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3647                                  "isc_thread_join() %s",
3648                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3649                                                 ISC_MSG_FAILED, "failed"));
3650 #endif /* ISC_PLATFORM_USETHREADS */
3651
3652         /*
3653          * Clean up.
3654          */
3655         cleanup_watcher(manager->mctx, manager);
3656
3657 #ifdef ISC_PLATFORM_USETHREADS
3658         (void)close(manager->pipe_fds[0]);
3659         (void)close(manager->pipe_fds[1]);
3660         (void)isc_condition_destroy(&manager->shutdown_ok);
3661 #endif /* ISC_PLATFORM_USETHREADS */
3662
3663         for (i = 0; i < (int)manager->maxsocks; i++)
3664                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
3665                         (void)close(i);
3666
3667         isc_mem_put(manager->mctx, manager->fds,
3668                     manager->maxsocks * sizeof(isc_socket_t *));
3669         isc_mem_put(manager->mctx, manager->fdstate,
3670                     manager->maxsocks * sizeof(int));
3671
3672         if (manager->fdlock != NULL) {
3673                 for (i = 0; i < FDLOCK_COUNT; i++)
3674                         DESTROYLOCK(&manager->fdlock[i]);
3675                 isc_mem_put(manager->mctx, manager->fdlock,
3676                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3677         }
3678         DESTROYLOCK(&manager->lock);
3679         manager->magic = 0;
3680         mctx= manager->mctx;
3681         isc_mem_put(mctx, manager, sizeof(*manager));
3682
3683         isc_mem_detach(&mctx);
3684
3685         *managerp = NULL;
3686 }
3687
/*
 * Common back end for the receive entry points: try to satisfy the
 * receive request 'dev' on 'sock' right away and, if that is not
 * possible, queue it on the socket's receive list.
 *
 * 'task' is recorded as the event's sender and is attached to when the
 * request has to be queued.  'flags' is only tested for
 * ISC_SOCKFLAG_IMMEDIATE here.
 *
 * Returns ISC_R_INPROGRESS when the request was queued and
 * ISC_SOCKFLAG_IMMEDIATE is set; ISC_R_SUCCESS otherwise.
 */
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
            unsigned int flags)
{
        int io_state;
        isc_boolean_t have_lock = ISC_FALSE;
        isc_task_t *ntask = NULL;
        isc_result_t result = ISC_R_SUCCESS;

        dev->ev_sender = task;

        if (sock->type == isc_sockettype_udp) {
                /* UDP: attempt the read without taking the socket lock. */
                io_state = doio_recv(sock, dev);
        } else {
                LOCK(&sock->lock);
                have_lock = ISC_TRUE;

                /*
                 * Only read directly when nothing is already queued;
                 * otherwise report DOIO_SOFT so this request is queued
                 * behind the earlier ones.
                 */
                if (ISC_LIST_EMPTY(sock->recv_list))
                        io_state = doio_recv(sock, dev);
                else
                        io_state = DOIO_SOFT;
        }

        switch (io_state) {
        case DOIO_SOFT:
                /*
                 * We couldn't read all or part of the request right now, so
                 * queue it.
                 *
                 * Attach to socket and to task
                 */
                isc_task_attach(task, &ntask);
                dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

                /* The UDP path gets here unlocked; the list needs the lock. */
                if (!have_lock) {
                        LOCK(&sock->lock);
                        have_lock = ISC_TRUE;
                }

                /*
                 * Enqueue the request.  If the socket was previously not being
                 * watched, poke the watcher to start paying attention to it.
                 */
                if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
                        select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
                ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

                socket_log(sock, NULL, EVENT, NULL, 0, 0,
                           "socket_recv: event %p -> task %p",
                           dev, ntask);

                if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
                        result = ISC_R_INPROGRESS;
                break;

        case DOIO_EOF:
                dev->result = ISC_R_EOF;
                /* fallthrough */

        case DOIO_HARD:
        case DOIO_SUCCESS:
                /*
                 * The request is finished.  With ISC_SOCKFLAG_IMMEDIATE set
                 * no completion event is sent from here.
                 */
                if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
                        send_recvdone_event(sock, &dev);
                break;
        }

        if (have_lock)
                UNLOCK(&sock->lock);

        return (result);
}
3759
3760 isc_result_t
3761 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3762                  unsigned int minimum, isc_task_t *task,
3763                  isc_taskaction_t action, const void *arg)
3764 {
3765         isc_socketevent_t *dev;
3766         isc_socketmgr_t *manager;
3767         unsigned int iocount;
3768         isc_buffer_t *buffer;
3769
3770         REQUIRE(VALID_SOCKET(sock));
3771         REQUIRE(buflist != NULL);
3772         REQUIRE(!ISC_LIST_EMPTY(*buflist));
3773         REQUIRE(task != NULL);
3774         REQUIRE(action != NULL);
3775
3776         manager = sock->manager;
3777         REQUIRE(VALID_MANAGER(manager));
3778
3779         iocount = isc_bufferlist_availablecount(buflist);
3780         REQUIRE(iocount > 0);
3781
3782         INSIST(sock->bound);
3783
3784         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
3785         if (dev == NULL) {
3786                 return (ISC_R_NOMEMORY);
3787         }
3788
3789         /*
3790          * UDP sockets are always partial read
3791          */
3792         if (sock->type == isc_sockettype_udp)
3793                 dev->minimum = 1;
3794         else {
3795                 if (minimum == 0)
3796                         dev->minimum = iocount;
3797                 else
3798                         dev->minimum = minimum;
3799         }
3800
3801         /*
3802          * Move each buffer from the passed in list to our internal one.
3803          */
3804         buffer = ISC_LIST_HEAD(*buflist);
3805         while (buffer != NULL) {
3806                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
3807                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3808                 buffer = ISC_LIST_HEAD(*buflist);
3809         }
3810
3811         return (socket_recv(sock, dev, task, 0));
3812 }
3813
3814 isc_result_t
3815 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
3816                 isc_task_t *task, isc_taskaction_t action, const void *arg)
3817 {
3818         isc_socketevent_t *dev;
3819         isc_socketmgr_t *manager;
3820
3821         REQUIRE(VALID_SOCKET(sock));
3822         REQUIRE(action != NULL);
3823
3824         manager = sock->manager;
3825         REQUIRE(VALID_MANAGER(manager));
3826
3827         INSIST(sock->bound);
3828
3829         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
3830         if (dev == NULL)
3831                 return (ISC_R_NOMEMORY);
3832
3833         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
3834 }
3835
3836 isc_result_t
3837 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
3838                  unsigned int minimum, isc_task_t *task,
3839                  isc_socketevent_t *event, unsigned int flags)
3840 {
3841         event->ev_sender = sock;
3842         event->result = ISC_R_UNEXPECTED;
3843         ISC_LIST_INIT(event->bufferlist);
3844         event->region = *region;
3845         event->n = 0;
3846         event->offset = 0;
3847         event->attributes = 0;
3848
3849         /*
3850          * UDP sockets are always partial read.
3851          */
3852         if (sock->type == isc_sockettype_udp)
3853                 event->minimum = 1;
3854         else {
3855                 if (minimum == 0)
3856                         event->minimum = region->length;
3857                 else
3858                         event->minimum = minimum;
3859         }
3860
3861         return (socket_recv(sock, event, task, flags));
3862 }
3863
/*
 * Common back end for the send entry points: try to transmit the request
 * 'dev' on 'sock' right away and, if that is not possible, queue it on
 * the socket's send list (unless ISC_SOCKFLAG_NORETRY is set).
 *
 * 'address' is applied to the event via set_dev_address().  When
 * 'pktinfo' is non-NULL it is copied into the event and the event is
 * flagged with ISC_SOCKEVENTATTR_PKTINFO.  'task' is recorded as the
 * event's sender and is attached to when the request has to be queued.
 *
 * Returns ISC_R_INPROGRESS when the request was queued and
 * ISC_SOCKFLAG_IMMEDIATE is set; ISC_R_SUCCESS otherwise.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
            isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
            unsigned int flags)
{
        int io_state;
        isc_boolean_t have_lock = ISC_FALSE;
        isc_task_t *ntask = NULL;
        isc_result_t result = ISC_R_SUCCESS;

        dev->ev_sender = task;

        set_dev_address(address, sock, dev);
        if (pktinfo != NULL) {
                dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
                dev->pktinfo = *pktinfo;

                /*
                 * The caller's interface index is kept only for site-local
                 * and link-local destinations.
                 */
                if (!isc_sockaddr_issitelocal(&dev->address) &&
                    !isc_sockaddr_islinklocal(&dev->address)) {
                        socket_log(sock, NULL, TRACE, isc_msgcat,
                                   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
                                   "pktinfo structure provided, ifindex %u "
                                   "(set to 0)", pktinfo->ipi6_ifindex);

                        /*
                         * Set the pktinfo index to 0 here, to let the
                         * kernel decide what interface it should send on.
                         */
                        dev->pktinfo.ipi6_ifindex = 0;
                }
        }

        if (sock->type == isc_sockettype_udp)
                /* UDP: attempt the write without taking the socket lock. */
                io_state = doio_send(sock, dev);
        else {
                LOCK(&sock->lock);
                have_lock = ISC_TRUE;

                /*
                 * Only write directly when nothing is already queued;
                 * otherwise report DOIO_SOFT so this request is queued
                 * behind the earlier ones.
                 */
                if (ISC_LIST_EMPTY(sock->send_list))
                        io_state = doio_send(sock, dev);
                else
                        io_state = DOIO_SOFT;
        }

        switch (io_state) {
        case DOIO_SOFT:
                /*
                 * We couldn't send all or part of the request right now, so
                 * queue it unless ISC_SOCKFLAG_NORETRY is set.
                 */
                if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
                        isc_task_attach(task, &ntask);
                        dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

                        /* The UDP path gets here unlocked. */
                        if (!have_lock) {
                                LOCK(&sock->lock);
                                have_lock = ISC_TRUE;
                        }

                        /*
                         * Enqueue the request.  If the socket was previously
                         * not being watched, poke the watcher to start
                         * paying attention to it.
                         */
                        if (ISC_LIST_EMPTY(sock->send_list) &&
                            !sock->pending_send)
                                select_poke(sock->manager, sock->fd,
                                            SELECT_POKE_WRITE);
                        ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

                        socket_log(sock, NULL, EVENT, NULL, 0, 0,
                                   "socket_send: event %p -> task %p",
                                   dev, ntask);

                        if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
                                result = ISC_R_INPROGRESS;
                        break;
                }
                /*
                 * fallthrough: with ISC_SOCKFLAG_NORETRY the request is not
                 * queued and is completed below like a finished send.
                 */

        case DOIO_HARD:
        case DOIO_SUCCESS:
                /*
                 * The request is finished.  With ISC_SOCKFLAG_IMMEDIATE set
                 * no completion event is sent from here.
                 */
                if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
                        send_senddone_event(sock, &dev);
                break;
        }

        if (have_lock)
                UNLOCK(&sock->lock);

        return (result);
}
3955
3956 isc_result_t
3957 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
3958                 isc_task_t *task, isc_taskaction_t action, const void *arg)
3959 {
3960         /*
3961          * REQUIRE() checking is performed in isc_socket_sendto().
3962          */
3963         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3964                                   NULL));
3965 }
3966
3967 isc_result_t
3968 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
3969                   isc_task_t *task, isc_taskaction_t action, const void *arg,
3970                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3971 {
3972         isc_socketevent_t *dev;
3973         isc_socketmgr_t *manager;
3974
3975         REQUIRE(VALID_SOCKET(sock));
3976         REQUIRE(region != NULL);
3977         REQUIRE(task != NULL);
3978         REQUIRE(action != NULL);
3979
3980         manager = sock->manager;
3981         REQUIRE(VALID_MANAGER(manager));
3982
3983         INSIST(sock->bound);
3984
3985         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3986         if (dev == NULL) {
3987                 return (ISC_R_NOMEMORY);
3988         }
3989
3990         dev->region = *region;
3991
3992         return (socket_send(sock, dev, task, address, pktinfo, 0));
3993 }
3994
3995 isc_result_t
3996 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3997                  isc_task_t *task, isc_taskaction_t action, const void *arg)
3998 {
3999         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4000                                    NULL));
4001 }
4002
4003 isc_result_t
4004 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4005                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4006                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4007 {
4008         isc_socketevent_t *dev;
4009         isc_socketmgr_t *manager;
4010         unsigned int iocount;
4011         isc_buffer_t *buffer;
4012
4013         REQUIRE(VALID_SOCKET(sock));
4014         REQUIRE(buflist != NULL);
4015         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4016         REQUIRE(task != NULL);
4017         REQUIRE(action != NULL);
4018
4019         manager = sock->manager;
4020         REQUIRE(VALID_MANAGER(manager));
4021
4022         iocount = isc_bufferlist_usedcount(buflist);
4023         REQUIRE(iocount > 0);
4024
4025         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4026         if (dev == NULL) {
4027                 return (ISC_R_NOMEMORY);
4028         }
4029
4030         /*
4031          * Move each buffer from the passed in list to our internal one.
4032          */
4033         buffer = ISC_LIST_HEAD(*buflist);
4034         while (buffer != NULL) {
4035                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4036                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4037                 buffer = ISC_LIST_HEAD(*buflist);
4038         }
4039
4040         return (socket_send(sock, dev, task, address, pktinfo, 0));
4041 }
4042
4043 isc_result_t
4044 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4045                    isc_task_t *task,
4046                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4047                    isc_socketevent_t *event, unsigned int flags)
4048 {
4049         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4050         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4051                 REQUIRE(sock->type == isc_sockettype_udp);
4052         event->ev_sender = sock;
4053         event->result = ISC_R_UNEXPECTED;
4054         ISC_LIST_INIT(event->bufferlist);
4055         event->region = *region;
4056         event->n = 0;
4057         event->offset = 0;
4058         event->attributes = 0;
4059
4060         return (socket_send(sock, event, task, address, pktinfo, flags));
4061 }
4062
4063 void
4064 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4065 #ifdef ISC_PLATFORM_HAVESYSUNH
4066         int s;
4067         struct stat sb;
4068         char strbuf[ISC_STRERRORSIZE];
4069
4070         if (sockaddr->type.sa.sa_family != AF_UNIX)
4071                 return;
4072
4073 #ifndef S_ISSOCK
4074 #if defined(S_IFMT) && defined(S_IFSOCK)
4075 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4076 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4077 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4078 #endif
4079 #endif
4080
4081 #ifndef S_ISFIFO
4082 #if defined(S_IFMT) && defined(S_IFIFO)
4083 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4084 #elif defined(_S_IFMT) && defined(S_IFIFO)
4085 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4086 #endif
4087 #endif
4088
4089 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4090 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4091 #endif
4092
4093 #ifndef S_ISFIFO
4094 #define S_ISFIFO(mode) 0
4095 #endif
4096
4097 #ifndef S_ISSOCK
4098 #define S_ISSOCK(mode) 0
4099 #endif
4100
4101         if (active) {
4102                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4103                         isc__strerror(errno, strbuf, sizeof(strbuf));
4104                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4105                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4106                                       "isc_socket_cleanunix: stat(%s): %s",
4107                                       sockaddr->type.sunix.sun_path, strbuf);
4108                         return;
4109                 }
4110                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4111                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4112                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4113                                       "isc_socket_cleanunix: %s: not a socket",
4114                                       sockaddr->type.sunix.sun_path);
4115                         return;
4116                 }
4117                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4118                         isc__strerror(errno, strbuf, sizeof(strbuf));
4119                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4120                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4121                                       "isc_socket_cleanunix: unlink(%s): %s",
4122                                       sockaddr->type.sunix.sun_path, strbuf);
4123                 }
4124                 return;
4125         }
4126
4127         s = socket(AF_UNIX, SOCK_STREAM, 0);
4128         if (s < 0) {
4129                 isc__strerror(errno, strbuf, sizeof(strbuf));
4130                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4131                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4132                               "isc_socket_cleanunix: socket(%s): %s",
4133                               sockaddr->type.sunix.sun_path, strbuf);
4134                 return;
4135         }
4136
4137         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4138                 switch (errno) {
4139                 case ENOENT:    /* We exited cleanly last time */
4140                         break;
4141                 default:
4142                         isc__strerror(errno, strbuf, sizeof(strbuf));
4143                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4144                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4145                                       "isc_socket_cleanunix: stat(%s): %s",
4146                                       sockaddr->type.sunix.sun_path, strbuf);
4147                         break;
4148                 }
4149                 goto cleanup;
4150         }
4151
4152         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4153                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4154                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4155                               "isc_socket_cleanunix: %s: not a socket",
4156                               sockaddr->type.sunix.sun_path);
4157                 goto cleanup;
4158         }
4159
4160         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4161                     sizeof(sockaddr->type.sunix)) < 0) {
4162                 switch (errno) {
4163                 case ECONNREFUSED:
4164                 case ECONNRESET:
4165                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4166                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4167                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4168                                               ISC_LOGMODULE_SOCKET,
4169                                               ISC_LOG_WARNING,
4170                                               "isc_socket_cleanunix: "
4171                                               "unlink(%s): %s",
4172                                               sockaddr->type.sunix.sun_path,
4173                                               strbuf);
4174                         }
4175                         break;
4176                 default:
4177                         isc__strerror(errno, strbuf, sizeof(strbuf));
4178                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4179                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4180                                       "isc_socket_cleanunix: connect(%s): %s",
4181                                       sockaddr->type.sunix.sun_path, strbuf);
4182                         break;
4183                 }
4184         }
4185  cleanup:
4186         close(s);
4187 #else
4188         UNUSED(sockaddr);
4189         UNUSED(active);
4190 #endif
4191 }
4192
4193 isc_result_t
4194 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4195                     isc_uint32_t owner, isc_uint32_t group)
4196 {
4197 #ifdef ISC_PLATFORM_HAVESYSUNH
4198         isc_result_t result = ISC_R_SUCCESS;
4199         char strbuf[ISC_STRERRORSIZE];
4200         char path[sizeof(sockaddr->type.sunix.sun_path)];
4201 #ifdef NEED_SECURE_DIRECTORY
4202         char *slash;
4203 #endif
4204
4205         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4206         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4207         strcpy(path, sockaddr->type.sunix.sun_path);
4208
4209 #ifdef NEED_SECURE_DIRECTORY
4210         slash = strrchr(path, '/');
4211         if (slash != NULL) {
4212                 if (slash != path)
4213                         *slash = '\0';
4214                 else
4215                         strcpy(path, "/");
4216         } else
4217                 strcpy(path, ".");
4218 #endif
4219
4220         if (chmod(path, perm) < 0) {
4221                 isc__strerror(errno, strbuf, sizeof(strbuf));
4222                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4223                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4224                               "isc_socket_permunix: chmod(%s, %d): %s",
4225                               path, perm, strbuf);
4226                 result = ISC_R_FAILURE;
4227         }
4228         if (chown(path, owner, group) < 0) {
4229                 isc__strerror(errno, strbuf, sizeof(strbuf));
4230                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4231                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4232                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4233                               path, owner, group,
4234                               strbuf);
4235                 result = ISC_R_FAILURE;
4236         }
4237         return (result);
4238 #else
4239         UNUSED(sockaddr);
4240         UNUSED(perm);
4241         UNUSED(owner);
4242         UNUSED(group);
4243         return (ISC_R_NOTIMPLEMENTED);
4244 #endif
4245 }
4246
4247 isc_result_t
4248 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4249                 unsigned int options) {
4250         char strbuf[ISC_STRERRORSIZE];
4251         int on = 1;
4252
4253         LOCK(&sock->lock);
4254
4255         INSIST(!sock->bound);
4256
4257         if (sock->pf != sockaddr->type.sa.sa_family) {
4258                 UNLOCK(&sock->lock);
4259                 return (ISC_R_FAMILYMISMATCH);
4260         }
4261         /*
4262          * Only set SO_REUSEADDR when we want a specific port.
4263          */
4264 #ifdef AF_UNIX
4265         if (sock->pf == AF_UNIX)
4266                 goto bind_socket;
4267 #endif
4268         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4269             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4270             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4271                        sizeof(on)) < 0) {
4272                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4273                                  "setsockopt(%d) %s", sock->fd,
4274                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4275                                                 ISC_MSG_FAILED, "failed"));
4276                 /* Press on... */
4277         }
4278 #ifdef AF_UNIX
4279  bind_socket:
4280 #endif
4281         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4282                 UNLOCK(&sock->lock);
4283                 switch (errno) {
4284                 case EACCES:
4285                         return (ISC_R_NOPERM);
4286                 case EADDRNOTAVAIL:
4287                         return (ISC_R_ADDRNOTAVAIL);
4288                 case EADDRINUSE:
4289                         return (ISC_R_ADDRINUSE);
4290                 case EINVAL:
4291                         return (ISC_R_BOUND);
4292                 default:
4293                         isc__strerror(errno, strbuf, sizeof(strbuf));
4294                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4295                                          strbuf);
4296                         return (ISC_R_UNEXPECTED);
4297                 }
4298         }
4299
4300         socket_log(sock, sockaddr, TRACE,
4301                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4302         sock->bound = 1;
4303
4304         UNLOCK(&sock->lock);
4305         return (ISC_R_SUCCESS);
4306 }
4307
4308 isc_result_t
4309 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4310 #ifdef SO_ACCEPTFILTER
4311         char strbuf[ISC_STRERRORSIZE];
4312         struct accept_filter_arg afa;
4313 #else
4314         UNUSED(sock);
4315         UNUSED(filter);
4316 #endif
4317
4318         REQUIRE(VALID_SOCKET(sock));
4319
4320 #ifdef SO_ACCEPTFILTER
4321         bzero(&afa, sizeof(afa));
4322         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4323         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4324                          &afa, sizeof(afa)) == -1) {
4325                 isc__strerror(errno, strbuf, sizeof(strbuf));
4326                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4327                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4328                            strbuf);
4329                 return (ISC_R_FAILURE);
4330         }
4331         return (ISC_R_SUCCESS);
4332 #else
4333         return (ISC_R_NOTIMPLEMENTED);
4334 #endif
4335 }
4336
4337 /*
4338  * Set up to listen on a given socket.  We do this by creating an internal
4339  * event that will be dispatched when the socket has read activity.  The
4340  * watcher will send the internal event to the task when there is a new
4341  * connection.
4342  *
4343  * Unlike in read, we don't preallocate a done event here.  Every time there
4344  * is a new connection we'll have to allocate a new one anyway, so we might
4345  * as well keep things simple rather than having to track them.
4346  */
4347 isc_result_t
4348 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4349         char strbuf[ISC_STRERRORSIZE];
4350
4351         REQUIRE(VALID_SOCKET(sock));
4352
4353         LOCK(&sock->lock);
4354
4355         REQUIRE(!sock->listener);
4356         REQUIRE(sock->bound);
4357         REQUIRE(sock->type == isc_sockettype_tcp ||
4358                 sock->type == isc_sockettype_unix);
4359
4360         if (backlog == 0)
4361                 backlog = SOMAXCONN;
4362
4363         if (listen(sock->fd, (int)backlog) < 0) {
4364                 UNLOCK(&sock->lock);
4365                 isc__strerror(errno, strbuf, sizeof(strbuf));
4366
4367                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4368
4369                 return (ISC_R_UNEXPECTED);
4370         }
4371
4372         sock->listener = 1;
4373
4374         UNLOCK(&sock->lock);
4375         return (ISC_R_SUCCESS);
4376 }
4377
4378 /*
4379  * This should try to do aggressive accept() XXXMLG
4380  */
4381 isc_result_t
4382 isc_socket_accept(isc_socket_t *sock,
4383                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4384 {
4385         isc_socket_newconnev_t *dev;
4386         isc_socketmgr_t *manager;
4387         isc_task_t *ntask = NULL;
4388         isc_socket_t *nsock;
4389         isc_result_t result;
4390         isc_boolean_t do_poke = ISC_FALSE;
4391
4392         REQUIRE(VALID_SOCKET(sock));
4393         manager = sock->manager;
4394         REQUIRE(VALID_MANAGER(manager));
4395
4396         LOCK(&sock->lock);
4397
4398         REQUIRE(sock->listener);
4399
4400         /*
4401          * Sender field is overloaded here with the task we will be sending
4402          * this event to.  Just before the actual event is delivered the
4403          * actual ev_sender will be touched up to be the socket.
4404          */
4405         dev = (isc_socket_newconnev_t *)
4406                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4407                                    action, arg, sizeof(*dev));
4408         if (dev == NULL) {
4409                 UNLOCK(&sock->lock);
4410                 return (ISC_R_NOMEMORY);
4411         }
4412         ISC_LINK_INIT(dev, ev_link);
4413
4414         result = allocate_socket(manager, sock->type, &nsock);
4415         if (result != ISC_R_SUCCESS) {
4416                 isc_event_free(ISC_EVENT_PTR(&dev));
4417                 UNLOCK(&sock->lock);
4418                 return (result);
4419         }
4420
4421         /*
4422          * Attach to socket and to task.
4423          */
4424         isc_task_attach(task, &ntask);
4425         nsock->references++;
4426
4427         dev->ev_sender = ntask;
4428         dev->newsocket = nsock;
4429
4430         /*
4431          * Poke watcher here.  We still have the socket locked, so there
4432          * is no race condition.  We will keep the lock for such a short
4433          * bit of time waking it up now or later won't matter all that much.
4434          */
4435         if (ISC_LIST_EMPTY(sock->accept_list))
4436                 do_poke = ISC_TRUE;
4437
4438         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4439
4440         if (do_poke)
4441                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4442
4443         UNLOCK(&sock->lock);
4444         return (ISC_R_SUCCESS);
4445 }
4446
4447 isc_result_t
4448 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4449                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4450 {
4451         isc_socket_connev_t *dev;
4452         isc_task_t *ntask = NULL;
4453         isc_socketmgr_t *manager;
4454         int cc;
4455         char strbuf[ISC_STRERRORSIZE];
4456
4457         REQUIRE(VALID_SOCKET(sock));
4458         REQUIRE(addr != NULL);
4459         REQUIRE(task != NULL);
4460         REQUIRE(action != NULL);
4461
4462         manager = sock->manager;
4463         REQUIRE(VALID_MANAGER(manager));
4464         REQUIRE(addr != NULL);
4465
4466         if (isc_sockaddr_ismulticast(addr))
4467                 return (ISC_R_MULTICAST);
4468
4469         LOCK(&sock->lock);
4470
4471         REQUIRE(!sock->connecting);
4472
4473         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4474                                                         ISC_SOCKEVENT_CONNECT,
4475                                                         action, arg,
4476                                                         sizeof(*dev));
4477         if (dev == NULL) {
4478                 UNLOCK(&sock->lock);
4479                 return (ISC_R_NOMEMORY);
4480         }
4481         ISC_LINK_INIT(dev, ev_link);
4482
4483         /*
4484          * Try to do the connect right away, as there can be only one
4485          * outstanding, and it might happen to complete.
4486          */
4487         sock->address = *addr;
4488         cc = connect(sock->fd, &addr->type.sa, addr->length);
4489         if (cc < 0) {
4490                 /*
4491                  * HP-UX "fails" to connect a UDP socket and sets errno to
4492                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4493                  * a success and let the user detect it if it's really an error
4494                  * at the time of sending a packet on the socket.
4495                  */
4496                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4497                         cc = 0;
4498                         goto success;
4499                 }
4500                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4501                         goto queue;
4502
4503                 switch (errno) {
4504 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4505                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4506                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4507                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4508                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4509                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4510 #ifdef EHOSTDOWN
4511                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4512 #endif
4513                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4514                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4515                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4516                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4517                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4518 #undef ERROR_MATCH
4519                 }
4520
4521                 sock->connected = 0;
4522
4523                 isc__strerror(errno, strbuf, sizeof(strbuf));
4524                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
4525
4526                 UNLOCK(&sock->lock);
4527                 isc_event_free(ISC_EVENT_PTR(&dev));
4528                 return (ISC_R_UNEXPECTED);
4529
4530         err_exit:
4531                 sock->connected = 0;
4532                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4533
4534                 UNLOCK(&sock->lock);
4535                 return (ISC_R_SUCCESS);
4536         }
4537
4538         /*
4539          * If connect completed, fire off the done event.
4540          */
4541  success:
4542         if (cc == 0) {
4543                 sock->connected = 1;
4544                 sock->bound = 1;
4545                 dev->result = ISC_R_SUCCESS;
4546                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4547
4548                 UNLOCK(&sock->lock);
4549                 return (ISC_R_SUCCESS);
4550         }
4551
4552  queue:
4553
4554         /*
4555          * Attach to task.
4556          */
4557         isc_task_attach(task, &ntask);
4558
4559         sock->connecting = 1;
4560
4561         dev->ev_sender = ntask;
4562
4563         /*
4564          * Poke watcher here.  We still have the socket locked, so there
4565          * is no race condition.  We will keep the lock for such a short
4566          * bit of time waking it up now or later won't matter all that much.
4567          */
4568         if (sock->connect_ev == NULL)
4569                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
4570
4571         sock->connect_ev = dev;
4572
4573         UNLOCK(&sock->lock);
4574         return (ISC_R_SUCCESS);
4575 }
4576
4577 /*
4578  * Called when a socket with a pending connect() finishes.
4579  */
4580 static void
4581 internal_connect(isc_task_t *me, isc_event_t *ev) {
4582         isc_socket_t *sock;
4583         isc_socket_connev_t *dev;
4584         isc_task_t *task;
4585         int cc;
4586         ISC_SOCKADDR_LEN_T optlen;
4587         char strbuf[ISC_STRERRORSIZE];
4588         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4589
4590         UNUSED(me);
4591         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
4592
4593         sock = ev->ev_sender;
4594         INSIST(VALID_SOCKET(sock));
4595
4596         LOCK(&sock->lock);
4597
4598         /*
4599          * When the internal event was sent the reference count was bumped
4600          * to keep the socket around for us.  Decrement the count here.
4601          */
4602         INSIST(sock->references > 0);
4603         sock->references--;
4604         if (sock->references == 0) {
4605                 UNLOCK(&sock->lock);
4606                 destroy(&sock);
4607                 return;
4608         }
4609
4610         /*
4611          * Has this event been canceled?
4612          */
4613         dev = sock->connect_ev;
4614         if (dev == NULL) {
4615                 INSIST(!sock->connecting);
4616                 UNLOCK(&sock->lock);
4617                 return;
4618         }
4619
4620         INSIST(sock->connecting);
4621         sock->connecting = 0;
4622
4623         /*
4624          * Get any possible error status here.
4625          */
4626         optlen = sizeof(cc);
4627         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
4628                        (void *)&cc, (void *)&optlen) < 0)
4629                 cc = errno;
4630         else
4631                 errno = cc;
4632
4633         if (errno != 0) {
4634                 /*
4635                  * If the error is EAGAIN, just re-select on this
4636                  * fd and pretend nothing strange happened.
4637                  */
4638                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4639                         sock->connecting = 1;
4640                         select_poke(sock->manager, sock->fd,
4641                                     SELECT_POKE_CONNECT);
4642                         UNLOCK(&sock->lock);
4643
4644                         return;
4645                 }
4646
4647                 /*
4648                  * Translate other errors into ISC_R_* flavors.
4649                  */
4650                 switch (errno) {
4651 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
4652                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4653                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4654                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4655                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4656                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4657 #ifdef EHOSTDOWN
4658                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4659 #endif
4660                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4661                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4662                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4663                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4664                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4665                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4666 #undef ERROR_MATCH
4667                 default:
4668                         dev->result = ISC_R_UNEXPECTED;
4669                         isc_sockaddr_format(&sock->address, peerbuf,
4670                                             sizeof(peerbuf));
4671                         isc__strerror(errno, strbuf, sizeof(strbuf));
4672                         UNEXPECTED_ERROR(__FILE__, __LINE__,
4673                                          "internal_connect: connect(%s) %s",
4674                                          peerbuf, strbuf);
4675                 }
4676         } else {
4677                 dev->result = ISC_R_SUCCESS;
4678                 sock->connected = 1;
4679                 sock->bound = 1;
4680         }
4681
4682         sock->connect_ev = NULL;
4683
4684         UNLOCK(&sock->lock);
4685
4686         task = dev->ev_sender;
4687         dev->ev_sender = sock;
4688         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
4689 }
4690
4691 isc_result_t
4692 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4693         isc_result_t result;
4694
4695         REQUIRE(VALID_SOCKET(sock));
4696         REQUIRE(addressp != NULL);
4697
4698         LOCK(&sock->lock);
4699
4700         if (sock->connected) {
4701                 *addressp = sock->address;
4702                 result = ISC_R_SUCCESS;
4703         } else {
4704                 result = ISC_R_NOTCONNECTED;
4705         }
4706
4707         UNLOCK(&sock->lock);
4708
4709         return (result);
4710 }
4711
4712 isc_result_t
4713 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
4714         ISC_SOCKADDR_LEN_T len;
4715         isc_result_t result;
4716         char strbuf[ISC_STRERRORSIZE];
4717
4718         REQUIRE(VALID_SOCKET(sock));
4719         REQUIRE(addressp != NULL);
4720
4721         LOCK(&sock->lock);
4722
4723         if (!sock->bound) {
4724                 result = ISC_R_NOTBOUND;
4725                 goto out;
4726         }
4727
4728         result = ISC_R_SUCCESS;
4729
4730         len = sizeof(addressp->type);
4731         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
4732                 isc__strerror(errno, strbuf, sizeof(strbuf));
4733                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
4734                                  strbuf);
4735                 result = ISC_R_UNEXPECTED;
4736                 goto out;
4737         }
4738         addressp->length = (unsigned int)len;
4739
4740  out:
4741         UNLOCK(&sock->lock);
4742
4743         return (result);
4744 }
4745
4746 /*
4747  * Run through the list of events on this socket, and cancel the ones
4748  * queued for task "task" of type "how".  "how" is a bitmask.
4749  */
4750 void
4751 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
4752
4753         REQUIRE(VALID_SOCKET(sock));
4754
4755         /*
4756          * Quick exit if there is nothing to do.  Don't even bother locking
4757          * in this case.
4758          */
4759         if (how == 0)
4760                 return;
4761
4762         LOCK(&sock->lock);
4763
4764         /*
4765          * All of these do the same thing, more or less.
4766          * Each will:
4767          *      o If the internal event is marked as "posted" try to
4768          *        remove it from the task's queue.  If this fails, mark it
4769          *        as canceled instead, and let the task clean it up later.
4770          *      o For each I/O request for that task of that type, post
4771          *        its done event with status of "ISC_R_CANCELED".
4772          *      o Reset any state needed.
4773          */
4774         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
4775             && !ISC_LIST_EMPTY(sock->recv_list)) {
4776                 isc_socketevent_t      *dev;
4777                 isc_socketevent_t      *next;
4778                 isc_task_t             *current_task;
4779
4780                 dev = ISC_LIST_HEAD(sock->recv_list);
4781
4782                 while (dev != NULL) {
4783                         current_task = dev->ev_sender;
4784                         next = ISC_LIST_NEXT(dev, ev_link);
4785
4786                         if ((task == NULL) || (task == current_task)) {
4787                                 dev->result = ISC_R_CANCELED;
4788                                 send_recvdone_event(sock, &dev);
4789                         }
4790                         dev = next;
4791                 }
4792         }
4793
4794         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
4795             && !ISC_LIST_EMPTY(sock->send_list)) {
4796                 isc_socketevent_t      *dev;
4797                 isc_socketevent_t      *next;
4798                 isc_task_t             *current_task;
4799
4800                 dev = ISC_LIST_HEAD(sock->send_list);
4801
4802                 while (dev != NULL) {
4803                         current_task = dev->ev_sender;
4804                         next = ISC_LIST_NEXT(dev, ev_link);
4805
4806                         if ((task == NULL) || (task == current_task)) {
4807                                 dev->result = ISC_R_CANCELED;
4808                                 send_senddone_event(sock, &dev);
4809                         }
4810                         dev = next;
4811                 }
4812         }
4813
4814         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
4815             && !ISC_LIST_EMPTY(sock->accept_list)) {
4816                 isc_socket_newconnev_t *dev;
4817                 isc_socket_newconnev_t *next;
4818                 isc_task_t             *current_task;
4819
4820                 dev = ISC_LIST_HEAD(sock->accept_list);
4821                 while (dev != NULL) {
4822                         current_task = dev->ev_sender;
4823                         next = ISC_LIST_NEXT(dev, ev_link);
4824
4825                         if ((task == NULL) || (task == current_task)) {
4826
4827                                 ISC_LIST_UNLINK(sock->accept_list, dev,
4828                                                 ev_link);
4829
4830                                 dev->newsocket->references--;
4831                                 free_socket(&dev->newsocket);
4832
4833                                 dev->result = ISC_R_CANCELED;
4834                                 dev->ev_sender = sock;
4835                                 isc_task_sendanddetach(&current_task,
4836                                                        ISC_EVENT_PTR(&dev));
4837                         }
4838
4839                         dev = next;
4840                 }
4841         }
4842
4843         /*
4844          * Connecting is not a list.
4845          */
4846         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
4847             && sock->connect_ev != NULL) {
4848                 isc_socket_connev_t    *dev;
4849                 isc_task_t             *current_task;
4850
4851                 INSIST(sock->connecting);
4852                 sock->connecting = 0;
4853
4854                 dev = sock->connect_ev;
4855                 current_task = dev->ev_sender;
4856
4857                 if ((task == NULL) || (task == current_task)) {
4858                         sock->connect_ev = NULL;
4859
4860                         dev->result = ISC_R_CANCELED;
4861                         dev->ev_sender = sock;
4862                         isc_task_sendanddetach(&current_task,
4863                                                ISC_EVENT_PTR(&dev));
4864                 }
4865         }
4866
4867         UNLOCK(&sock->lock);
4868 }
4869
4870 isc_sockettype_t
4871 isc_socket_gettype(isc_socket_t *sock) {
4872         REQUIRE(VALID_SOCKET(sock));
4873
4874         return (sock->type);
4875 }
4876
4877 isc_boolean_t
4878 isc_socket_isbound(isc_socket_t *sock) {
4879         isc_boolean_t val;
4880
4881         LOCK(&sock->lock);
4882         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
4883         UNLOCK(&sock->lock);
4884
4885         return (val);
4886 }
4887
4888 void
4889 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
4890 #if defined(IPV6_V6ONLY)
4891         int onoff = yes ? 1 : 0;
4892 #else
4893         UNUSED(yes);
4894         UNUSED(sock);
4895 #endif
4896
4897         REQUIRE(VALID_SOCKET(sock));
4898
4899 #ifdef IPV6_V6ONLY
4900         if (sock->pf == AF_INET6) {
4901                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
4902                                (void *)&onoff, sizeof(int)) < 0) {
4903                         char strbuf[ISC_STRERRORSIZE];
4904
4905                         UNEXPECTED_ERROR(__FILE__, __LINE__,
4906                                          "setsockopt(%d, IPV6_V6ONLY) "
4907                                          "%s: %s", sock->fd,
4908                                          isc_msgcat_get(isc_msgcat,
4909                                                         ISC_MSGSET_GENERAL,
4910                                                         ISC_MSG_FAILED,
4911                                                         "failed"),
4912                                          strbuf);
4913                 }
4914         }
4915         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
4916 #endif
4917 }
4918
4919 #ifndef ISC_PLATFORM_USETHREADS
4920 /* In our assumed scenario, we can simply use a single static object. */
4921 static isc_socketwait_t swait_private;
4922
4923 int
4924 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
4925         int n;
4926 #ifdef USE_KQUEUE
4927         struct timespec ts, *tsp;
4928 #endif
4929 #ifdef USE_EPOLL
4930         int timeout;
4931 #endif
4932 #ifdef USE_DEVPOLL
4933         struct dvpoll dvp;
4934 #endif
4935
4936         REQUIRE(swaitp != NULL && *swaitp == NULL);
4937
4938         if (socketmgr == NULL)
4939                 return (0);
4940
4941 #ifdef USE_KQUEUE
4942         if (tvp != NULL) {
4943                 ts.tv_sec = tvp->tv_sec;
4944                 ts.tv_nsec = tvp->tv_usec * 1000;
4945                 tsp = &ts;
4946         } else
4947                 tsp = NULL;
4948         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
4949                                        socketmgr->events, socketmgr->nevents,
4950                                        tsp);
4951         n = swait_private.nevents;
4952 #elif defined(USE_EPOLL)
4953         if (tvp != NULL)
4954                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
4955         else
4956                 timeout = -1;
4957         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
4958                                            socketmgr->events,
4959                                            socketmgr->nevents, timeout);
4960         n = swait_private.nevents;
4961 #elif defined(USE_DEVPOLL)
4962         dvp.dp_fds = socketmgr->events;
4963         dvp.dp_nfds = socketmgr->nevents;
4964         if (tvp != NULL) {
4965                 dvp.dp_timeout = tvp->tv_sec * 1000 +
4966                         (tvp->tv_usec + 999) / 1000;
4967         } else
4968                 dvp.dp_timeout = -1;
4969         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
4970         n = swait_private.nevents;
4971 #elif defined(USE_SELECT)
4972         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
4973                socketmgr->fd_bufsize);
4974         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
4975                socketmgr->fd_bufsize);
4976
4977         swait_private.readset = socketmgr->read_fds_copy;
4978         swait_private.writeset = socketmgr->write_fds_copy;
4979         swait_private.maxfd = socketmgr->maxfd + 1;
4980
4981         n = select(swait_private.maxfd, swait_private.readset,
4982                    swait_private.writeset, NULL, tvp);
4983 #endif
4984
4985         *swaitp = &swait_private;
4986         return (n);
4987 }
4988
4989 isc_result_t
4990 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
4991         REQUIRE(swait == &swait_private);
4992
4993         if (socketmgr == NULL)
4994                 return (ISC_R_NOTFOUND);
4995
4996 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
4997         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
4998         return (ISC_R_SUCCESS);
4999 #elif defined(USE_SELECT)
5000         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5001         return (ISC_R_SUCCESS);
5002 #endif
5003 }
5004 #endif /* ISC_PLATFORM_USETHREADS */