]> CyberLeo.Net >> Repos - FreeBSD/stable/8.git/blob - contrib/bind9/lib/isc/unix/socket.c
Update to BIND 9.6.3, the latest from ISC on the 9.6 branch.
[FreeBSD/stable/8.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2010  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.308.12.17 2010-12-22 03:28:13 marka Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #if defined(HAVE_SYS_DEVPOLL_H)
71 #include <sys/devpoll.h>
72 #elif defined(HAVE_DEVPOLL_H)
73 #include <devpoll.h>
74 #endif
75 #endif
76
77 #include "errno2result.h"
78
79 #ifndef ISC_PLATFORM_USETHREADS
80 #include "socket_p.h"
81 #endif /* ISC_PLATFORM_USETHREADS */
82
83 #if defined(SO_BSDCOMPAT) && defined(__linux__)
84 #include <sys/utsname.h>
85 #endif
86
87 /*%
88  * Choose the most preferable multiplex method.
89  */
90 #ifdef ISC_PLATFORM_HAVEKQUEUE
91 #define USE_KQUEUE
92 #elif defined (ISC_PLATFORM_HAVEEPOLL)
93 #define USE_EPOLL
94 #elif defined (ISC_PLATFORM_HAVEDEVPOLL)
95 #define USE_DEVPOLL
/*
 * Per-descriptor /dev/poll bookkeeping: one flag for each direction
 * (read, write) that is currently being polled for.
 */
typedef struct {
        unsigned int want_read : 1;
        unsigned int want_write : 1;
} pollinfo_t;
100 #else
101 #define USE_SELECT
102 #endif  /* ISC_PLATFORM_HAVEKQUEUE */
103
104 #ifndef ISC_PLATFORM_USETHREADS
105 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*
 * Wait state for non-threaded builds using kqueue, epoll or /dev/poll.
 */
struct isc_socketwait {
        int nevents;    /* NOTE(review): presumably an event count - confirm */
};
109 #elif defined (USE_SELECT)
/*
 * Wait state for non-threaded builds using select().
 */
struct isc_socketwait {
        fd_set *readset;        /* descriptor set for reads */
        fd_set *writeset;       /* descriptor set for writes */
        int nfds;
        int maxfd;
};
116 #endif  /* USE_KQUEUE */
117 #endif /* !ISC_PLATFORM_USETHREADS */
118
119 /*%
120  * Maximum number of allowable open sockets.  This is also the maximum
121  * allowable socket file descriptor.
122  *
123  * Care should be taken before modifying this value for select():
124  * The API standard doesn't ensure select() accepts more than (the system default
125  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
126  * the vast majority of cases.  This constant should therefore be increased only
127  * when absolutely necessary and possible, i.e., the server is exhausting all
128  * available file descriptors (up to FD_SETSIZE) and the select() function
129  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
130  * always be true, but we keep using some of them to ensure as much
131  * portability as possible).  Note also that overall server performance
132  * may be rather worsened with a larger value of this constant due to
133  * inherent scalability problems of select().
134  *
135  * As a special note, this value shouldn't have to be touched if
136  * this is a build for an authoritative only DNS server.
137  */
138 #ifndef ISC_SOCKET_MAXSOCKETS
139 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
140 #define ISC_SOCKET_MAXSOCKETS 4096
141 #elif defined(USE_SELECT)
142 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
143 #endif  /* USE_KQUEUE... */
144 #endif  /* ISC_SOCKET_MAXSOCKETS */
145
146 #ifdef USE_SELECT
147 /*%
148  * Mac OS X needs a special definition to support larger values in select().
149  * We always define this because a larger value can be specified run-time.
150  */
151 #ifdef __APPLE__
152 #define _DARWIN_UNLIMITED_SELECT
153 #endif  /* __APPLE__ */
154 #endif  /* USE_SELECT */
155
156 #ifdef ISC_SOCKET_USE_POLLWATCH
157 /*%
158  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
159  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
160  * some of the specified FD.  The idea is based on the observation that it's
161  * likely for a busy server to keep receiving packets.  It specifically works
162  * as follows: the socket watcher is first initialized with the state of
163  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
164  * event occurs.  When it wakes up for a socket I/O event, it moves to the
165  * poll_active state, and sets the poll timeout to a short period
166  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
167  * watcher goes to the poll_checking state with the same timeout period.
168  * In this state, the watcher tries to detect whether this is a break
169  * during intermittent events or the kernel bug is triggered.  If the next
170  * polling reports an event within the short period, the previous timeout is
171  * likely to be a kernel bug, and so the watcher goes back to the active state.
172  * Otherwise, it moves to the idle state again.
173  *
174  * It's not clear whether this is a thread-related bug, but since we've only
175  * seen this with threads, this workaround is used only when enabling threads.
176  */
177
178 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
179
180 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
181 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
182 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
183 #endif  /* ISC_SOCKET_USE_POLLWATCH */
184
185 /*%
186  * Size of per-FD lock buckets.
187  */
188 #ifdef ISC_PLATFORM_USETHREADS
189 #define FDLOCK_COUNT            1024
190 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
191 #else
192 #define FDLOCK_COUNT            1
193 #define FDLOCK_ID(fd)           0
194 #endif  /* ISC_PLATFORM_USETHREADS */
195
196 /*%
197  * Maximum number of events communicated with the kernel.  There should normally
198  * be no need for having a large number.
199  */
200 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
201 #ifndef ISC_SOCKET_MAXEVENTS
202 #define ISC_SOCKET_MAXEVENTS    64
203 #endif
204 #endif
205
206 /*%
207  * Some systems define the socket length argument as an int, some as size_t,
208  * some as socklen_t.  This is here so it can be easily changed if needed.
209  */
210 #ifndef ISC_SOCKADDR_LEN_T
211 #define ISC_SOCKADDR_LEN_T unsigned int
212 #endif
213
214 /*%
215  * Define what the possible "soft" errors can be.  These are non-fatal returns
216  * of various network related functions, like recv() and so on.
217  *
218  * For some reason, BSDI (and perhaps others) will sometimes return <0
219  * from recv() but will have errno==0.  This is broken, but we have to
220  * work around it here.
221  */
222 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
223                          (e) == EWOULDBLOCK || \
224                          (e) == EINTR || \
225                          (e) == 0)
226
227 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
228
229 /*!<
230  * DLVL(90)  --  Function entry/exit and other tracing.
231  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
232  * DLVL(60)  --  Socket data send/receive
233  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
234  * DLVL(20)  --  Socket creation/destruction.
235  */
236 #define TRACE_LEVEL             90
237 #define CORRECTNESS_LEVEL       70
238 #define IOEVENT_LEVEL           60
239 #define EVENT_LEVEL             50
240 #define CREATION_LEVEL          20
241
242 #define TRACE           DLVL(TRACE_LEVEL)
243 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
244 #define IOEVENT         DLVL(IOEVENT_LEVEL)
245 #define EVENT           DLVL(EVENT_LEVEL)
246 #define CREATION        DLVL(CREATION_LEVEL)
247
248 typedef isc_event_t intev_t;
249
250 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
251 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
252
253 /*!
254  * IPv6 control information.  If the socket is an IPv6 socket we want
255  * to collect the destination address and interface so the client can
256  * set them on outgoing packets.
257  */
258 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
259 #ifndef USE_CMSG
260 #define USE_CMSG        1
261 #endif
262 #endif
263
264 /*%
265  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
266  * a setsockopt() like interface to request timestamps, and if the OS
267  * doesn't do it for us, call gettimeofday() on every UDP receive?
268  */
269 #ifdef SO_TIMESTAMP
270 #ifndef USE_CMSG
271 #define USE_CMSG        1
272 #endif
273 #endif
274
275 /*%
276  * The size to raise the receive buffer to (from BIND 8).
277  */
278 #define RCVBUFSIZE (32*1024)
279
280 /*%
281  * The number of times a send operation is repeated if the result is EINTR.
282  */
283 #define NRETRIES 10
284
285 struct isc_socket {
286         /* Not locked. */
287         unsigned int            magic;
288         isc_socketmgr_t        *manager;
289         isc_mutex_t             lock;
290         isc_sockettype_t        type;
291         const isc_statscounter_t        *statsindex;
292
293         /* Locked by socket lock. */
294         ISC_LINK(isc_socket_t)  link;
295         unsigned int            references;
296         int                     fd;
297         int                     pf;
298         char                            name[16];
299         void *                          tag;
300
301         ISC_LIST(isc_socketevent_t)             send_list;
302         ISC_LIST(isc_socketevent_t)             recv_list;
303         ISC_LIST(isc_socket_newconnev_t)        accept_list;
304         isc_socket_connev_t                    *connect_ev;
305
306         /*
307          * Internal events.  Posted when a descriptor is readable or
308          * writable.  These are statically allocated and never freed.
309          * They will be set to non-purgable before use.
310          */
311         intev_t                 readable_ev;
312         intev_t                 writable_ev;
313
314         isc_sockaddr_t          peer_address;  /* remote address */
315
316         unsigned int            pending_recv : 1,
317                                 pending_send : 1,
318                                 pending_accept : 1,
319                                 listener : 1, /* listener socket */
320                                 connected : 1,
321                                 connecting : 1, /* connect pending */
322                                 bound : 1; /* bound to local addr */
323
324 #ifdef ISC_NET_RECVOVERFLOW
325         unsigned char           overflow; /* used for MSG_TRUNC fake */
326 #endif
327
328         char                    *recvcmsgbuf;
329         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
330         char                    *sendcmsgbuf;
331         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
332
333         void                    *fdwatcharg;
334         isc_sockfdwatch_t       fdwatchcb;
335         int                     fdwatchflags;
336         isc_task_t              *fdwatchtask;
337 };
338
339 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
340 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
341
342 struct isc_socketmgr {
343         /* Not locked. */
344         unsigned int            magic;
345         isc_mem_t              *mctx;
346         isc_mutex_t             lock;
347         isc_mutex_t             *fdlock;
348         isc_stats_t             *stats;
349 #ifdef USE_KQUEUE
350         int                     kqueue_fd;
351         int                     nevents;
352         struct kevent           *events;
353 #endif  /* USE_KQUEUE */
354 #ifdef USE_EPOLL
355         int                     epoll_fd;
356         int                     nevents;
357         struct epoll_event      *events;
358 #endif  /* USE_EPOLL */
359 #ifdef USE_DEVPOLL
360         int                     devpoll_fd;
361         int                     nevents;
362         struct pollfd           *events;
363 #endif  /* USE_DEVPOLL */
364 #ifdef USE_SELECT
365         int                     fd_bufsize;
366 #endif  /* USE_SELECT */
367         unsigned int            maxsocks;
368 #ifdef ISC_PLATFORM_USETHREADS
369         int                     pipe_fds[2];
370 #endif
371
372         /* Locked by fdlock. */
373         isc_socket_t           **fds;
374         int                     *fdstate;
375 #ifdef USE_DEVPOLL
376         pollinfo_t              *fdpollinfo;
377 #endif
378
379         /* Locked by manager lock. */
380         ISC_LIST(isc_socket_t)  socklist;
381 #ifdef USE_SELECT
382         fd_set                  *read_fds;
383         fd_set                  *read_fds_copy;
384         fd_set                  *write_fds;
385         fd_set                  *write_fds_copy;
386         int                     maxfd;
387 #endif  /* USE_SELECT */
388         int                     reserved;       /* unlocked */
389 #ifdef ISC_PLATFORM_USETHREADS
390         isc_thread_t            watcher;
391         isc_condition_t         shutdown_ok;
392 #else /* ISC_PLATFORM_USETHREADS */
393         unsigned int            refs;
394 #endif /* ISC_PLATFORM_USETHREADS */
395 };
396
397 #ifndef ISC_PLATFORM_USETHREADS
398 static isc_socketmgr_t *socketmgr = NULL;
399 #endif /* ISC_PLATFORM_USETHREADS */
400
401 #define CLOSED                  0       /* this one must be zero */
402 #define MANAGED                 1
403 #define CLOSE_PENDING           2
404
405 /*
406  * send() and recv() iovec counts
407  */
408 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
409 #ifdef ISC_NET_RECVOVERFLOW
410 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
411 #else
412 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
413 #endif
414
415 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
416 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
417 static void free_socket(isc_socket_t **);
418 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
419                                     isc_socket_t **);
420 static void destroy(isc_socket_t **);
421 static void internal_accept(isc_task_t *, isc_event_t *);
422 static void internal_connect(isc_task_t *, isc_event_t *);
423 static void internal_recv(isc_task_t *, isc_event_t *);
424 static void internal_send(isc_task_t *, isc_event_t *);
425 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
426 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
427 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
428 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
429                               struct msghdr *, struct iovec *, size_t *);
430 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
431                               struct msghdr *, struct iovec *, size_t *);
432 #ifdef ISC_PLATFORM_USETHREADS
433 static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
434 #endif
435
436 #define SELECT_POKE_SHUTDOWN            (-1)
437 #define SELECT_POKE_NOTHING             (-2)
438 #define SELECT_POKE_READ                (-3)
439 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
440 #define SELECT_POKE_WRITE               (-4)
441 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
442 #define SELECT_POKE_CLOSE               (-5)
443
444 #define SOCK_DEAD(s)                    ((s)->references == 0)
445
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * The enumerators below are the slot numbers (0 through 9) used to index
 * the per-socket-type counter tables that follow.
 */
enum {
        STATID_OPEN = 0,        /* 0 */
        STATID_OPENFAIL,        /* 1 */
        STATID_CLOSE,           /* 2 */
        STATID_BINDFAIL,        /* 3 */
        STATID_CONNECTFAIL,     /* 4 */
        STATID_CONNECT,         /* 5 */
        STATID_ACCEPTFAIL,      /* 6 */
        STATID_ACCEPT,          /* 7 */
        STATID_SENDFAIL,        /* 8 */
        STATID_RECVFAIL         /* 9 */
};
461 static const isc_statscounter_t upd4statsindex[] = {
462         isc_sockstatscounter_udp4open,
463         isc_sockstatscounter_udp4openfail,
464         isc_sockstatscounter_udp4close,
465         isc_sockstatscounter_udp4bindfail,
466         isc_sockstatscounter_udp4connectfail,
467         isc_sockstatscounter_udp4connect,
468         -1,
469         -1,
470         isc_sockstatscounter_udp4sendfail,
471         isc_sockstatscounter_udp4recvfail
472 };
473 static const isc_statscounter_t upd6statsindex[] = {
474         isc_sockstatscounter_udp6open,
475         isc_sockstatscounter_udp6openfail,
476         isc_sockstatscounter_udp6close,
477         isc_sockstatscounter_udp6bindfail,
478         isc_sockstatscounter_udp6connectfail,
479         isc_sockstatscounter_udp6connect,
480         -1,
481         -1,
482         isc_sockstatscounter_udp6sendfail,
483         isc_sockstatscounter_udp6recvfail
484 };
485 static const isc_statscounter_t tcp4statsindex[] = {
486         isc_sockstatscounter_tcp4open,
487         isc_sockstatscounter_tcp4openfail,
488         isc_sockstatscounter_tcp4close,
489         isc_sockstatscounter_tcp4bindfail,
490         isc_sockstatscounter_tcp4connectfail,
491         isc_sockstatscounter_tcp4connect,
492         isc_sockstatscounter_tcp4acceptfail,
493         isc_sockstatscounter_tcp4accept,
494         isc_sockstatscounter_tcp4sendfail,
495         isc_sockstatscounter_tcp4recvfail
496 };
497 static const isc_statscounter_t tcp6statsindex[] = {
498         isc_sockstatscounter_tcp6open,
499         isc_sockstatscounter_tcp6openfail,
500         isc_sockstatscounter_tcp6close,
501         isc_sockstatscounter_tcp6bindfail,
502         isc_sockstatscounter_tcp6connectfail,
503         isc_sockstatscounter_tcp6connect,
504         isc_sockstatscounter_tcp6acceptfail,
505         isc_sockstatscounter_tcp6accept,
506         isc_sockstatscounter_tcp6sendfail,
507         isc_sockstatscounter_tcp6recvfail
508 };
509 static const isc_statscounter_t unixstatsindex[] = {
510         isc_sockstatscounter_unixopen,
511         isc_sockstatscounter_unixopenfail,
512         isc_sockstatscounter_unixclose,
513         isc_sockstatscounter_unixbindfail,
514         isc_sockstatscounter_unixconnectfail,
515         isc_sockstatscounter_unixconnect,
516         isc_sockstatscounter_unixacceptfail,
517         isc_sockstatscounter_unixaccept,
518         isc_sockstatscounter_unixsendfail,
519         isc_sockstatscounter_unixrecvfail
520 };
521 static const isc_statscounter_t fdwatchstatsindex[] = {
522         -1,
523         -1,
524         isc_sockstatscounter_fdwatchclose,
525         isc_sockstatscounter_fdwatchbindfail,
526         isc_sockstatscounter_fdwatchconnectfail,
527         isc_sockstatscounter_fdwatchconnect,
528         -1,
529         -1,
530         isc_sockstatscounter_fdwatchsendfail,
531         isc_sockstatscounter_fdwatchrecvfail
532 };
533
534 static void
535 manager_log(isc_socketmgr_t *sockmgr,
536             isc_logcategory_t *category, isc_logmodule_t *module, int level,
537             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
538 static void
539 manager_log(isc_socketmgr_t *sockmgr,
540             isc_logcategory_t *category, isc_logmodule_t *module, int level,
541             const char *fmt, ...)
542 {
543         char msgbuf[2048];
544         va_list ap;
545
546         if (! isc_log_wouldlog(isc_lctx, level))
547                 return;
548
549         va_start(ap, fmt);
550         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
551         va_end(ap);
552
553         isc_log_write(isc_lctx, category, module, level,
554                       "sockmgr %p: %s", sockmgr, msgbuf);
555 }
556
557 static void
558 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
559            isc_logcategory_t *category, isc_logmodule_t *module, int level,
560            isc_msgcat_t *msgcat, int msgset, int message,
561            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
562 static void
563 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
564            isc_logcategory_t *category, isc_logmodule_t *module, int level,
565            isc_msgcat_t *msgcat, int msgset, int message,
566            const char *fmt, ...)
567 {
568         char msgbuf[2048];
569         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
570         va_list ap;
571
572         if (! isc_log_wouldlog(isc_lctx, level))
573                 return;
574
575         va_start(ap, fmt);
576         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
577         va_end(ap);
578
579         if (address == NULL) {
580                 isc_log_iwrite(isc_lctx, category, module, level,
581                                msgcat, msgset, message,
582                                "socket %p: %s", sock, msgbuf);
583         } else {
584                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
585                 isc_log_iwrite(isc_lctx, category, module, level,
586                                msgcat, msgset, message,
587                                "socket %p %s: %s", sock, peerbuf, msgbuf);
588         }
589 }
590
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Turn IPV6_RECVPKTINFO back on for 'sock'.  Only IPv6 UDP sockets are
 * touched; any other socket is left alone.  A setsockopt() failure is
 * reported through UNEXPECTED_ERROR and is otherwise ignored.
 */
static void
FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
{
        char strbuf[ISC_STRERRORSIZE];
        int on = 1;

        if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
                return;

        if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                       (void *)&on, sizeof(on)) < 0) {
                /*
                 * Fill strbuf from errno before it is printed; without
                 * this the "%s" below would read an uninitialized buffer.
                 */
                isc__strerror(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "setsockopt(%d, IPV6_RECVPKTINFO) "
                                 "%s: %s", sock->fd,
                                 isc_msgcat_get(isc_msgcat,
                                                ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED,
                                                "failed"),
                                 strbuf);
        }
}
#else
/* Platforms without the AIX bug need no fix-up. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
622
623 /*%
624  * Increment socket-related statistics counters.
625  */
626 static inline void
627 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
628         REQUIRE(counterid != -1);
629
630         if (stats != NULL)
631                 isc_stats_increment(stats, counterid);
632 }
633
634 static inline isc_result_t
635 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
636         isc_result_t result = ISC_R_SUCCESS;
637
638 #ifdef USE_KQUEUE
639         struct kevent evchange;
640
641         memset(&evchange, 0, sizeof(evchange));
642         if (msg == SELECT_POKE_READ)
643                 evchange.filter = EVFILT_READ;
644         else
645                 evchange.filter = EVFILT_WRITE;
646         evchange.flags = EV_ADD;
647         evchange.ident = fd;
648         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
649                 result = isc__errno2result(errno);
650
651         return (result);
652 #elif defined(USE_EPOLL)
653         struct epoll_event event;
654
655         if (msg == SELECT_POKE_READ)
656                 event.events = EPOLLIN;
657         else
658                 event.events = EPOLLOUT;
659         memset(&event.data, 0, sizeof(event.data));
660         event.data.fd = fd;
661         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
662             errno != EEXIST) {
663                 result = isc__errno2result(errno);
664         }
665
666         return (result);
667 #elif defined(USE_DEVPOLL)
668         struct pollfd pfd;
669         int lockid = FDLOCK_ID(fd);
670
671         memset(&pfd, 0, sizeof(pfd));
672         if (msg == SELECT_POKE_READ)
673                 pfd.events = POLLIN;
674         else
675                 pfd.events = POLLOUT;
676         pfd.fd = fd;
677         pfd.revents = 0;
678         LOCK(&manager->fdlock[lockid]);
679         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
680                 result = isc__errno2result(errno);
681         else {
682                 if (msg == SELECT_POKE_READ)
683                         manager->fdpollinfo[fd].want_read = 1;
684                 else
685                         manager->fdpollinfo[fd].want_write = 1;
686         }
687         UNLOCK(&manager->fdlock[lockid]);
688
689         return (result);
690 #elif defined(USE_SELECT)
691         LOCK(&manager->lock);
692         if (msg == SELECT_POKE_READ)
693                 FD_SET(fd, manager->read_fds);
694         if (msg == SELECT_POKE_WRITE)
695                 FD_SET(fd, manager->write_fds);
696         UNLOCK(&manager->lock);
697
698         return (result);
699 #endif
700 }
701
702 static inline isc_result_t
703 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
704         isc_result_t result = ISC_R_SUCCESS;
705
706 #ifdef USE_KQUEUE
707         struct kevent evchange;
708
709         memset(&evchange, 0, sizeof(evchange));
710         if (msg == SELECT_POKE_READ)
711                 evchange.filter = EVFILT_READ;
712         else
713                 evchange.filter = EVFILT_WRITE;
714         evchange.flags = EV_DELETE;
715         evchange.ident = fd;
716         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
717                 result = isc__errno2result(errno);
718
719         return (result);
720 #elif defined(USE_EPOLL)
721         struct epoll_event event;
722
723         if (msg == SELECT_POKE_READ)
724                 event.events = EPOLLIN;
725         else
726                 event.events = EPOLLOUT;
727         memset(&event.data, 0, sizeof(event.data));
728         event.data.fd = fd;
729         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
730             errno != ENOENT) {
731                 char strbuf[ISC_STRERRORSIZE];
732                 isc__strerror(errno, strbuf, sizeof(strbuf));
733                 UNEXPECTED_ERROR(__FILE__, __LINE__,
734                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
735                 result = ISC_R_UNEXPECTED;
736         }
737         return (result);
738 #elif defined(USE_DEVPOLL)
739         struct pollfd pfds[2];
740         size_t writelen = sizeof(pfds[0]);
741         int lockid = FDLOCK_ID(fd);
742
743         memset(pfds, 0, sizeof(pfds));
744         pfds[0].events = POLLREMOVE;
745         pfds[0].fd = fd;
746
747         /*
748          * Canceling read or write polling via /dev/poll is tricky.  Since it
749          * only provides a way of canceling per FD, we may need to re-poll the
750          * socket for the other operation.
751          */
752         LOCK(&manager->fdlock[lockid]);
753         if (msg == SELECT_POKE_READ &&
754             manager->fdpollinfo[fd].want_write == 1) {
755                 pfds[1].events = POLLOUT;
756                 pfds[1].fd = fd;
757                 writelen += sizeof(pfds[1]);
758         }
759         if (msg == SELECT_POKE_WRITE &&
760             manager->fdpollinfo[fd].want_read == 1) {
761                 pfds[1].events = POLLIN;
762                 pfds[1].fd = fd;
763                 writelen += sizeof(pfds[1]);
764         }
765
766         if (write(manager->devpoll_fd, pfds, writelen) == -1)
767                 result = isc__errno2result(errno);
768         else {
769                 if (msg == SELECT_POKE_READ)
770                         manager->fdpollinfo[fd].want_read = 0;
771                 else
772                         manager->fdpollinfo[fd].want_write = 0;
773         }
774         UNLOCK(&manager->fdlock[lockid]);
775
776         return (result);
777 #elif defined(USE_SELECT)
778         LOCK(&manager->lock);
779         if (msg == SELECT_POKE_READ)
780                 FD_CLR(fd, manager->read_fds);
781         else if (msg == SELECT_POKE_WRITE)
782                 FD_CLR(fd, manager->write_fds);
783         UNLOCK(&manager->lock);
784
785         return (result);
786 #endif
787 }
788
789 static void
790 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
791         isc_result_t result;
792         int lockid = FDLOCK_ID(fd);
793
794         /*
795          * This is a wakeup on a socket.  If the socket is not in the
796          * process of being closed, start watching it for either reads
797          * or writes.
798          */
799
800         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
801
802         if (msg == SELECT_POKE_CLOSE) {
803                 /* No one should be updating fdstate, so no need to lock it */
804                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
805                 manager->fdstate[fd] = CLOSED;
806                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
807                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
808                 (void)close(fd);
809                 return;
810         }
811
812         LOCK(&manager->fdlock[lockid]);
813         if (manager->fdstate[fd] == CLOSE_PENDING) {
814                 UNLOCK(&manager->fdlock[lockid]);
815
816                 /*
817                  * We accept (and ignore) any error from unwatch_fd() as we are
818                  * closing the socket, hoping it doesn't leave dangling state in
819                  * the kernel.
820                  * Note that unwatch_fd() must be called after releasing the
821                  * fdlock; otherwise it could cause deadlock due to a lock order
822                  * reversal.
823                  */
824                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
825                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
826                 return;
827         }
828         if (manager->fdstate[fd] != MANAGED) {
829                 UNLOCK(&manager->fdlock[lockid]);
830                 return;
831         }
832         UNLOCK(&manager->fdlock[lockid]);
833
834         /*
835          * Set requested bit.
836          */
837         result = watch_fd(manager, fd, msg);
838         if (result != ISC_R_SUCCESS) {
839                 /*
840                  * XXXJT: what should we do?  Ignoring the failure of watching
841                  * a socket will make the application dysfunctional, but there
842                  * seems to be no reasonable recovery process.
843                  */
844                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
845                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
846                               "failed to start watching FD (%d): %s",
847                               fd, isc_result_totext(result));
848         }
849 }
850
851 #ifdef ISC_PLATFORM_USETHREADS
852 /*
853  * Poke the select loop when there is something for us to do.
854  * The write is required (by POSIX) to complete.  That is, we
855  * will not get partial writes.
856  */
857 static void
858 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
859         int cc;
860         int buf[2];
861         char strbuf[ISC_STRERRORSIZE];
862
863         buf[0] = fd;
864         buf[1] = msg;
865
866         do {
867                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
868 #ifdef ENOSR
869                 /*
870                  * Treat ENOSR as EAGAIN but loop slowly as it is
871                  * unlikely to clear fast.
872                  */
873                 if (cc < 0 && errno == ENOSR) {
874                         sleep(1);
875                         errno = EAGAIN;
876                 }
877 #endif
878         } while (cc < 0 && SOFT_ERROR(errno));
879
880         if (cc < 0) {
881                 isc__strerror(errno, strbuf, sizeof(strbuf));
882                 FATAL_ERROR(__FILE__, __LINE__,
883                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
884                                            ISC_MSG_WRITEFAILED,
885                                            "write() failed "
886                                            "during watcher poke: %s"),
887                             strbuf);
888         }
889
890         INSIST(cc == sizeof(buf));
891 }
892
893 /*
894  * Read a message on the internal fd.
895  */
896 static void
897 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
898         int buf[2];
899         int cc;
900         char strbuf[ISC_STRERRORSIZE];
901
902         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
903         if (cc < 0) {
904                 *msg = SELECT_POKE_NOTHING;
905                 *fd = -1;       /* Silence compiler. */
906                 if (SOFT_ERROR(errno))
907                         return;
908
909                 isc__strerror(errno, strbuf, sizeof(strbuf));
910                 FATAL_ERROR(__FILE__, __LINE__,
911                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
912                                            ISC_MSG_READFAILED,
913                                            "read() failed "
914                                            "during watcher poke: %s"),
915                             strbuf);
916
917                 return;
918         }
919         INSIST(cc == sizeof(buf));
920
921         *fd = buf[0];
922         *msg = buf[1];
923 }
924 #else /* ISC_PLATFORM_USETHREADS */
925 /*
926  * Update the state of the socketmgr when something changes.
927  */
928 static void
929 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
930         if (msg == SELECT_POKE_SHUTDOWN)
931                 return;
932         else if (fd >= 0)
933                 wakeup_socket(manager, fd, msg);
934         return;
935 }
936 #endif /* ISC_PLATFORM_USETHREADS */
937
938 /*
939  * Make a fd non-blocking.
940  */
941 static isc_result_t
942 make_nonblock(int fd) {
943         int ret;
944         int flags;
945         char strbuf[ISC_STRERRORSIZE];
946 #ifdef USE_FIONBIO_IOCTL
947         int on = 1;
948
949         ret = ioctl(fd, FIONBIO, (char *)&on);
950 #else
951         flags = fcntl(fd, F_GETFL, 0);
952         flags |= PORT_NONBLOCK;
953         ret = fcntl(fd, F_SETFL, flags);
954 #endif
955
956         if (ret == -1) {
957                 isc__strerror(errno, strbuf, sizeof(strbuf));
958                 UNEXPECTED_ERROR(__FILE__, __LINE__,
959 #ifdef USE_FIONBIO_IOCTL
960                                  "ioctl(%d, FIONBIO, &on): %s", fd,
961 #else
962                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
963 #endif
964                                  strbuf);
965
966                 return (ISC_R_UNEXPECTED);
967         }
968
969         return (ISC_R_SUCCESS);
970 }
971
972 #ifdef USE_CMSG
973 /*
974  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
975  * In order to ensure as much portability as possible, we provide wrapper
976  * functions of these macros.
977  * Note that cmsg_space() could run slow on OSes that do not have
978  * CMSG_SPACE.
979  */
980 static inline ISC_SOCKADDR_LEN_T
981 cmsg_len(ISC_SOCKADDR_LEN_T len) {
982 #ifdef CMSG_LEN
983         return (CMSG_LEN(len));
984 #else
985         ISC_SOCKADDR_LEN_T hdrlen;
986
987         /*
988          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
989          * is correct.
990          */
991         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
992         return (hdrlen + len);
993 #endif
994 }
995
996 static inline ISC_SOCKADDR_LEN_T
997 cmsg_space(ISC_SOCKADDR_LEN_T len) {
998 #ifdef CMSG_SPACE
999         return (CMSG_SPACE(len));
1000 #else
1001         struct msghdr msg;
1002         struct cmsghdr *cmsgp;
1003         /*
1004          * XXX: The buffer length is an ad-hoc value, but should be enough
1005          * in a practical sense.
1006          */
1007         char dummybuf[sizeof(struct cmsghdr) + 1024];
1008
1009         memset(&msg, 0, sizeof(msg));
1010         msg.msg_control = dummybuf;
1011         msg.msg_controllen = sizeof(dummybuf);
1012
1013         cmsgp = (struct cmsghdr *)dummybuf;
1014         cmsgp->cmsg_len = cmsg_len(len);
1015
1016         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1017         if (cmsgp != NULL)
1018                 return ((char *)cmsgp - (char *)msg.msg_control);
1019         else
1020                 return (0);
1021 #endif
1022 }
1023 #endif /* USE_CMSG */
1024
1025 /*
1026  * Process control messages received on a socket.
1027  */
1028 static void
1029 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1030 #ifdef USE_CMSG
1031         struct cmsghdr *cmsgp;
1032 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1033         struct in6_pktinfo *pktinfop;
1034 #endif
1035 #ifdef SO_TIMESTAMP
1036         struct timeval *timevalp;
1037 #endif
1038 #endif
1039
1040         /*
1041          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1042          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1043          * They are all here, outside of the CPP tests, because it is
1044          * more consistent with the usual ISC coding style.
1045          */
1046         UNUSED(sock);
1047         UNUSED(msg);
1048         UNUSED(dev);
1049
1050 #ifdef ISC_NET_BSD44MSGHDR
1051
1052 #ifdef MSG_TRUNC
1053         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1054                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1055 #endif
1056
1057 #ifdef MSG_CTRUNC
1058         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1059                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1060 #endif
1061
1062 #ifndef USE_CMSG
1063         return;
1064 #else
1065         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1066                 return;
1067
1068 #ifdef SO_TIMESTAMP
1069         timevalp = NULL;
1070 #endif
1071 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1072         pktinfop = NULL;
1073 #endif
1074
1075         cmsgp = CMSG_FIRSTHDR(msg);
1076         while (cmsgp != NULL) {
1077                 socket_log(sock, NULL, TRACE,
1078                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1079                            "processing cmsg %p", cmsgp);
1080
1081 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1082                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1083                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1084
1085                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1086                         memcpy(&dev->pktinfo, pktinfop,
1087                                sizeof(struct in6_pktinfo));
1088                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1089                         socket_log(sock, NULL, TRACE,
1090                                    isc_msgcat, ISC_MSGSET_SOCKET,
1091                                    ISC_MSG_IFRECEIVED,
1092                                    "interface received on ifindex %u",
1093                                    dev->pktinfo.ipi6_ifindex);
1094                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1095                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1096                         goto next;
1097                 }
1098 #endif
1099
1100 #ifdef SO_TIMESTAMP
1101                 if (cmsgp->cmsg_level == SOL_SOCKET
1102                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1103                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1104                         dev->timestamp.seconds = timevalp->tv_sec;
1105                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1106                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1107                         goto next;
1108                 }
1109 #endif
1110
1111         next:
1112                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1113         }
1114 #endif /* USE_CMSG */
1115
1116 #endif /* ISC_NET_BSD44MSGHDR */
1117 }
1118
1119 /*
1120  * Construct an iov array and attach it to the msghdr passed in.  This is
1121  * the SEND constructor, which will use the used region of the buffer
1122  * (if using a buffer list) or will use the internal region (if a single
1123  * buffer I/O is requested).
1124  *
1125  * Nothing can be NULL, and the done event must list at least one buffer
1126  * on the buffer linked list for this function to be meaningful.
1127  *
1128  * If write_countp != NULL, *write_countp will hold the number of bytes
1129  * this transaction can send.
1130  */
1131 static void
1132 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
1133                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1134 {
1135         unsigned int iovcount;
1136         isc_buffer_t *buffer;
1137         isc_region_t used;
1138         size_t write_count;
1139         size_t skip_count;
1140
1141         memset(msg, 0, sizeof(*msg));
1142
1143         if (!sock->connected) {
1144                 msg->msg_name = (void *)&dev->address.type.sa;
1145                 msg->msg_namelen = dev->address.length;
1146         } else {
1147                 msg->msg_name = NULL;
1148                 msg->msg_namelen = 0;
1149         }
1150
1151         buffer = ISC_LIST_HEAD(dev->bufferlist);
1152         write_count = 0;
1153         iovcount = 0;
1154
1155         /*
1156          * Single buffer I/O?  Skip what we've done so far in this region.
1157          */
1158         if (buffer == NULL) {
1159                 write_count = dev->region.length - dev->n;
1160                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1161                 iov[0].iov_len = write_count;
1162                 iovcount = 1;
1163
1164                 goto config;
1165         }
1166
1167         /*
1168          * Multibuffer I/O.
1169          * Skip the data in the buffer list that we have already written.
1170          */
1171         skip_count = dev->n;
1172         while (buffer != NULL) {
1173                 REQUIRE(ISC_BUFFER_VALID(buffer));
1174                 if (skip_count < isc_buffer_usedlength(buffer))
1175                         break;
1176                 skip_count -= isc_buffer_usedlength(buffer);
1177                 buffer = ISC_LIST_NEXT(buffer, link);
1178         }
1179
1180         while (buffer != NULL) {
1181                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1182
1183                 isc_buffer_usedregion(buffer, &used);
1184
1185                 if (used.length > 0) {
1186                         iov[iovcount].iov_base = (void *)(used.base
1187                                                           + skip_count);
1188                         iov[iovcount].iov_len = used.length - skip_count;
1189                         write_count += (used.length - skip_count);
1190                         skip_count = 0;
1191                         iovcount++;
1192                 }
1193                 buffer = ISC_LIST_NEXT(buffer, link);
1194         }
1195
1196         INSIST(skip_count == 0U);
1197
1198  config:
1199         msg->msg_iov = iov;
1200         msg->msg_iovlen = iovcount;
1201
1202 #ifdef ISC_NET_BSD44MSGHDR
1203         msg->msg_control = NULL;
1204         msg->msg_controllen = 0;
1205         msg->msg_flags = 0;
1206 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1207         if ((sock->type == isc_sockettype_udp)
1208             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1209                 struct cmsghdr *cmsgp;
1210                 struct in6_pktinfo *pktinfop;
1211
1212                 socket_log(sock, NULL, TRACE,
1213                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1214                            "sendto pktinfo data, ifindex %u",
1215                            dev->pktinfo.ipi6_ifindex);
1216
1217                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1218                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1219                 msg->msg_control = (void *)sock->sendcmsgbuf;
1220
1221                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1222                 cmsgp->cmsg_level = IPPROTO_IPV6;
1223                 cmsgp->cmsg_type = IPV6_PKTINFO;
1224                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1225                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1226                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1227         }
1228 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1229 #else /* ISC_NET_BSD44MSGHDR */
1230         msg->msg_accrights = NULL;
1231         msg->msg_accrightslen = 0;
1232 #endif /* ISC_NET_BSD44MSGHDR */
1233
1234         if (write_countp != NULL)
1235                 *write_countp = write_count;
1236 }
1237
1238 /*
1239  * Construct an iov array and attach it to the msghdr passed in.  This is
1240  * the RECV constructor, which will use the available region of the buffer
1241  * (if using a buffer list) or will use the internal region (if a single
1242  * buffer I/O is requested).
1243  *
1244  * Nothing can be NULL, and the done event must list at least one buffer
1245  * on the buffer linked list for this function to be meaningful.
1246  *
1247  * If read_countp != NULL, *read_countp will hold the number of bytes
1248  * this transaction can receive.
1249  */
1250 static void
1251 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1252                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1253 {
1254         unsigned int iovcount;
1255         isc_buffer_t *buffer;
1256         isc_region_t available;
1257         size_t read_count;
1258
1259         memset(msg, 0, sizeof(struct msghdr));
1260
1261         if (sock->type == isc_sockettype_udp) {
1262                 memset(&dev->address, 0, sizeof(dev->address));
1263 #ifdef BROKEN_RECVMSG
1264                 if (sock->pf == AF_INET) {
1265                         msg->msg_name = (void *)&dev->address.type.sin;
1266                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1267                 } else if (sock->pf == AF_INET6) {
1268                         msg->msg_name = (void *)&dev->address.type.sin6;
1269                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1270 #ifdef ISC_PLATFORM_HAVESYSUNH
1271                 } else if (sock->pf == AF_UNIX) {
1272                         msg->msg_name = (void *)&dev->address.type.sunix;
1273                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1274 #endif
1275                 } else {
1276                         msg->msg_name = (void *)&dev->address.type.sa;
1277                         msg->msg_namelen = sizeof(dev->address.type);
1278                 }
1279 #else
1280                 msg->msg_name = (void *)&dev->address.type.sa;
1281                 msg->msg_namelen = sizeof(dev->address.type);
1282 #endif
1283 #ifdef ISC_NET_RECVOVERFLOW
1284                 /* If needed, steal one iovec for overflow detection. */
1285                 maxiov--;
1286 #endif
1287         } else { /* TCP */
1288                 msg->msg_name = NULL;
1289                 msg->msg_namelen = 0;
1290                 dev->address = sock->peer_address;
1291         }
1292
1293         buffer = ISC_LIST_HEAD(dev->bufferlist);
1294         read_count = 0;
1295
1296         /*
1297          * Single buffer I/O?  Skip what we've done so far in this region.
1298          */
1299         if (buffer == NULL) {
1300                 read_count = dev->region.length - dev->n;
1301                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1302                 iov[0].iov_len = read_count;
1303                 iovcount = 1;
1304
1305                 goto config;
1306         }
1307
1308         /*
1309          * Multibuffer I/O.
1310          * Skip empty buffers.
1311          */
1312         while (buffer != NULL) {
1313                 REQUIRE(ISC_BUFFER_VALID(buffer));
1314                 if (isc_buffer_availablelength(buffer) != 0)
1315                         break;
1316                 buffer = ISC_LIST_NEXT(buffer, link);
1317         }
1318
1319         iovcount = 0;
1320         while (buffer != NULL) {
1321                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1322
1323                 isc_buffer_availableregion(buffer, &available);
1324
1325                 if (available.length > 0) {
1326                         iov[iovcount].iov_base = (void *)(available.base);
1327                         iov[iovcount].iov_len = available.length;
1328                         read_count += available.length;
1329                         iovcount++;
1330                 }
1331                 buffer = ISC_LIST_NEXT(buffer, link);
1332         }
1333
1334  config:
1335
1336         /*
1337          * If needed, set up to receive that one extra byte.  Note that
1338          * we know there is at least one iov left, since we stole it
1339          * at the top of this function.
1340          */
1341 #ifdef ISC_NET_RECVOVERFLOW
1342         if (sock->type == isc_sockettype_udp) {
1343                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1344                 iov[iovcount].iov_len = 1;
1345                 iovcount++;
1346         }
1347 #endif
1348
1349         msg->msg_iov = iov;
1350         msg->msg_iovlen = iovcount;
1351
1352 #ifdef ISC_NET_BSD44MSGHDR
1353         msg->msg_control = NULL;
1354         msg->msg_controllen = 0;
1355         msg->msg_flags = 0;
1356 #if defined(USE_CMSG)
1357         if (sock->type == isc_sockettype_udp) {
1358                 msg->msg_control = sock->recvcmsgbuf;
1359                 msg->msg_controllen = sock->recvcmsgbuflen;
1360         }
1361 #endif /* USE_CMSG */
1362 #else /* ISC_NET_BSD44MSGHDR */
1363         msg->msg_accrights = NULL;
1364         msg->msg_accrightslen = 0;
1365 #endif /* ISC_NET_BSD44MSGHDR */
1366
1367         if (read_countp != NULL)
1368                 *read_countp = read_count;
1369 }
1370
1371 static void
1372 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1373                 isc_socketevent_t *dev)
1374 {
1375         if (sock->type == isc_sockettype_udp) {
1376                 if (address != NULL)
1377                         dev->address = *address;
1378                 else
1379                         dev->address = sock->peer_address;
1380         } else if (sock->type == isc_sockettype_tcp) {
1381                 INSIST(address == NULL);
1382                 dev->address = sock->peer_address;
1383         }
1384 }
1385
1386 static void
1387 destroy_socketevent(isc_event_t *event) {
1388         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1389
1390         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1391
1392         (ev->destroy)(event);
1393 }
1394
1395 static isc_socketevent_t *
1396 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1397                      isc_taskaction_t action, const void *arg)
1398 {
1399         isc_socketevent_t *ev;
1400
1401         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1402                                                      sock, eventtype,
1403                                                      action, arg,
1404                                                      sizeof(*ev));
1405
1406         if (ev == NULL)
1407                 return (NULL);
1408
1409         ev->result = ISC_R_UNEXPECTED;
1410         ISC_LINK_INIT(ev, ev_link);
1411         ISC_LIST_INIT(ev->bufferlist);
1412         ev->region.base = NULL;
1413         ev->n = 0;
1414         ev->offset = 0;
1415         ev->attributes = 0;
1416         ev->destroy = ev->ev_destroy;
1417         ev->ev_destroy = destroy_socketevent;
1418
1419         return (ev);
1420 }
1421
1422 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the name, iovec array and (with
 * ISC_NET_BSD44MSGHDR) control fields of 'msg' on stdout.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        /* %p requires a void * argument, hence the explicit casts. */
        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
1441 #endif
1442
/* Status codes returned by doio_recv() and doio_send(). */
#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
#define DOIO_HARD    2 /* i/o error, event sent */
#define DOIO_EOF     3 /* EOF, no event sent */
1447
1448 static int
1449 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1450         int cc;
1451         struct iovec iov[MAXSCATTERGATHER_RECV];
1452         size_t read_count;
1453         size_t actual_count;
1454         struct msghdr msghdr;
1455         isc_buffer_t *buffer;
1456         int recv_errno;
1457         char strbuf[ISC_STRERRORSIZE];
1458
1459         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1460
1461 #if defined(ISC_SOCKET_DEBUG)
1462         dump_msg(&msghdr);
1463 #endif
1464
1465         cc = recvmsg(sock->fd, &msghdr, 0);
1466         recv_errno = errno;
1467
1468 #if defined(ISC_SOCKET_DEBUG)
1469         dump_msg(&msghdr);
1470 #endif
1471
1472         if (cc < 0) {
1473                 if (SOFT_ERROR(recv_errno))
1474                         return (DOIO_SOFT);
1475
1476                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1477                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1478                         socket_log(sock, NULL, IOEVENT,
1479                                    isc_msgcat, ISC_MSGSET_SOCKET,
1480                                    ISC_MSG_DOIORECV,
1481                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1482                                    sock->fd, cc, recv_errno, strbuf);
1483                 }
1484
1485 #define SOFT_OR_HARD(_system, _isc) \
1486         if (recv_errno == _system) { \
1487                 if (sock->connected) { \
1488                         dev->result = _isc; \
1489                         inc_stats(sock->manager->stats, \
1490                                   sock->statsindex[STATID_RECVFAIL]); \
1491                         return (DOIO_HARD); \
1492                 } \
1493                 return (DOIO_SOFT); \
1494         }
1495 #define ALWAYS_HARD(_system, _isc) \
1496         if (recv_errno == _system) { \
1497                 dev->result = _isc; \
1498                 inc_stats(sock->manager->stats, \
1499                           sock->statsindex[STATID_RECVFAIL]); \
1500                 return (DOIO_HARD); \
1501         }
1502
1503                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1504                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1505                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1506                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1507                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1508                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1509                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1510                 /*
1511                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1512                  * errors.
1513                  */
1514 #ifdef EPROTO
1515                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1516 #endif
1517                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1518
1519 #undef SOFT_OR_HARD
1520 #undef ALWAYS_HARD
1521
1522                 dev->result = isc__errno2result(recv_errno);
1523                 inc_stats(sock->manager->stats,
1524                           sock->statsindex[STATID_RECVFAIL]);
1525                 return (DOIO_HARD);
1526         }
1527
1528         /*
1529          * On TCP and UNIX sockets, zero length reads indicate EOF,
1530          * while on UDP sockets, zero length reads are perfectly valid,
1531          * although strange.
1532          */
1533         switch (sock->type) {
1534         case isc_sockettype_tcp:
1535         case isc_sockettype_unix:
1536                 if (cc == 0)
1537                         return (DOIO_EOF);
1538                 break;
1539         case isc_sockettype_udp:
1540                 break;
1541         case isc_sockettype_fdwatch:
1542         default:
1543                 INSIST(0);
1544         }
1545
1546         if (sock->type == isc_sockettype_udp) {
1547                 dev->address.length = msghdr.msg_namelen;
1548                 if (isc_sockaddr_getport(&dev->address) == 0) {
1549                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1550                                 socket_log(sock, &dev->address, IOEVENT,
1551                                            isc_msgcat, ISC_MSGSET_SOCKET,
1552                                            ISC_MSG_ZEROPORT,
1553                                            "dropping source port zero packet");
1554                         }
1555                         return (DOIO_SOFT);
1556                 }
1557         }
1558
1559         socket_log(sock, &dev->address, IOEVENT,
1560                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1561                    "packet received correctly");
1562
1563         /*
1564          * Overflow bit detection.  If we received MORE bytes than we should,
1565          * this indicates an overflow situation.  Set the flag in the
1566          * dev entry and adjust how much we read by one.
1567          */
1568 #ifdef ISC_NET_RECVOVERFLOW
1569         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1570                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1571                 cc--;
1572         }
1573 #endif
1574
1575         /*
1576          * If there are control messages attached, run through them and pull
1577          * out the interesting bits.
1578          */
1579         if (sock->type == isc_sockettype_udp)
1580                 process_cmsg(sock, &msghdr, dev);
1581
1582         /*
1583          * update the buffers (if any) and the i/o count
1584          */
1585         dev->n += cc;
1586         actual_count = cc;
1587         buffer = ISC_LIST_HEAD(dev->bufferlist);
1588         while (buffer != NULL && actual_count > 0U) {
1589                 REQUIRE(ISC_BUFFER_VALID(buffer));
1590                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1591                         actual_count -= isc_buffer_availablelength(buffer);
1592                         isc_buffer_add(buffer,
1593                                        isc_buffer_availablelength(buffer));
1594                 } else {
1595                         isc_buffer_add(buffer, actual_count);
1596                         actual_count = 0;
1597                         break;
1598                 }
1599                 buffer = ISC_LIST_NEXT(buffer, link);
1600                 if (buffer == NULL) {
1601                         INSIST(actual_count == 0U);
1602                 }
1603         }
1604
1605         /*
1606          * If we read less than we expected, update counters,
1607          * and let the upper layer poke the descriptor.
1608          */
1609         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1610                 return (DOIO_SOFT);
1611
1612         /*
1613          * Full reads are posted, or partials if partials are ok.
1614          */
1615         dev->result = ISC_R_SUCCESS;
1616         return (DOIO_SUCCESS);
1617 }
1618
1619 /*
1620  * Returns:
1621  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1622  *                      ISC_R_SUCCESS.
1623  *
1624  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1625  *                      dev->result contains the appropriate error.
1626  *
1627  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1628  *                      event was sent.  The operation should be retried.
1629  *
1630  *      No other return values are possible.
1631  */
1632 static int
1633 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1634         int cc;
1635         struct iovec iov[MAXSCATTERGATHER_SEND];
1636         size_t write_count;
1637         struct msghdr msghdr;
1638         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1639         int attempts = 0;
1640         int send_errno;
1641         char strbuf[ISC_STRERRORSIZE];
1642
1643         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1644
1645  resend:
1646         cc = sendmsg(sock->fd, &msghdr, 0);
1647         send_errno = errno;
1648
1649         /*
1650          * Check for error or block condition.
1651          */
1652         if (cc < 0) {
1653                 if (send_errno == EINTR && ++attempts < NRETRIES)
1654                         goto resend;
1655
1656                 if (SOFT_ERROR(send_errno))
1657                         return (DOIO_SOFT);
1658
1659 #define SOFT_OR_HARD(_system, _isc) \
1660         if (send_errno == _system) { \
1661                 if (sock->connected) { \
1662                         dev->result = _isc; \
1663                         inc_stats(sock->manager->stats, \
1664                                   sock->statsindex[STATID_SENDFAIL]); \
1665                         return (DOIO_HARD); \
1666                 } \
1667                 return (DOIO_SOFT); \
1668         }
1669 #define ALWAYS_HARD(_system, _isc) \
1670         if (send_errno == _system) { \
1671                 dev->result = _isc; \
1672                 inc_stats(sock->manager->stats, \
1673                           sock->statsindex[STATID_SENDFAIL]); \
1674                 return (DOIO_HARD); \
1675         }
1676
1677                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1678                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1679                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1680                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1681                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1682 #ifdef EHOSTDOWN
1683                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1684 #endif
1685                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1686                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1687                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1688                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1689                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1690
1691 #undef SOFT_OR_HARD
1692 #undef ALWAYS_HARD
1693
1694                 /*
1695                  * The other error types depend on whether or not the
1696                  * socket is UDP or TCP.  If it is UDP, some errors
1697                  * that we expect to be fatal under TCP are merely
1698                  * annoying, and are really soft errors.
1699                  *
1700                  * However, these soft errors are still returned as
1701                  * a status.
1702                  */
1703                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1704                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1705                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1706                                  addrbuf, strbuf);
1707                 dev->result = isc__errno2result(send_errno);
1708                 inc_stats(sock->manager->stats,
1709                           sock->statsindex[STATID_SENDFAIL]);
1710                 return (DOIO_HARD);
1711         }
1712
1713         if (cc == 0) {
1714                 inc_stats(sock->manager->stats,
1715                           sock->statsindex[STATID_SENDFAIL]);
1716                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1717                                  "doio_send: send() %s 0",
1718                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1719                                                 ISC_MSG_RETURNED, "returned"));
1720         }
1721
1722         /*
1723          * If we write less than we expected, update counters, poke.
1724          */
1725         dev->n += cc;
1726         if ((size_t)cc != write_count)
1727                 return (DOIO_SOFT);
1728
1729         /*
1730          * Exactly what we wanted to write.  We're done with this
1731          * entry.  Post its completion event.
1732          */
1733         dev->result = ISC_R_SUCCESS;
1734         return (DOIO_SUCCESS);
1735 }
1736
1737 /*
1738  * Kill.
1739  *
1740  * Caller must ensure that the socket is not locked and no external
1741  * references exist.
1742  */
1743 static void
1744 closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1745         isc_sockettype_t type = sock->type;
1746         int lockid = FDLOCK_ID(fd);
1747
1748         /*
1749          * No one has this socket open, so the watcher doesn't have to be
1750          * poked, and the socket doesn't have to be locked.
1751          */
1752         LOCK(&manager->fdlock[lockid]);
1753         manager->fds[fd] = NULL;
1754         if (type == isc_sockettype_fdwatch)
1755                 manager->fdstate[fd] = CLOSED;
1756         else
1757                 manager->fdstate[fd] = CLOSE_PENDING;
1758         UNLOCK(&manager->fdlock[lockid]);
1759         if (type == isc_sockettype_fdwatch) {
1760                 /*
1761                  * The caller may close the socket once this function returns,
1762                  * and `fd' may be reassigned for a new socket.  So we do
1763                  * unwatch_fd() here, rather than defer it via select_poke().
1764                  * Note: this may complicate data protection among threads and
1765                  * may reduce performance due to additional locks.  One way to
1766                  * solve this would be to dup() the watched descriptor, but we
1767                  * take a simpler approach at this moment.
1768                  */
1769                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1770                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1771         } else
1772                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1773
1774         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1775
1776         /*
1777          * update manager->maxfd here (XXX: this should be implemented more
1778          * efficiently)
1779          */
1780 #ifdef USE_SELECT
1781         LOCK(&manager->lock);
1782         if (manager->maxfd == fd) {
1783                 int i;
1784
1785                 manager->maxfd = 0;
1786                 for (i = fd - 1; i >= 0; i--) {
1787                         lockid = FDLOCK_ID(i);
1788
1789                         LOCK(&manager->fdlock[lockid]);
1790                         if (manager->fdstate[i] == MANAGED) {
1791                                 manager->maxfd = i;
1792                                 UNLOCK(&manager->fdlock[lockid]);
1793                                 break;
1794                         }
1795                         UNLOCK(&manager->fdlock[lockid]);
1796                 }
1797 #ifdef ISC_PLATFORM_USETHREADS
1798                 if (manager->maxfd < manager->pipe_fds[0])
1799                         manager->maxfd = manager->pipe_fds[0];
1800 #endif
1801         }
1802         UNLOCK(&manager->lock);
1803 #endif  /* USE_SELECT */
1804 }
1805
1806 static void
1807 destroy(isc_socket_t **sockp) {
1808         int fd;
1809         isc_socket_t *sock = *sockp;
1810         isc_socketmgr_t *manager = sock->manager;
1811
1812         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1813                    ISC_MSG_DESTROYING, "destroying");
1814
1815         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1816         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1817         INSIST(ISC_LIST_EMPTY(sock->send_list));
1818         INSIST(sock->connect_ev == NULL);
1819         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1820
1821         if (sock->fd >= 0) {
1822                 fd = sock->fd;
1823                 sock->fd = -1;
1824                 closesocket(manager, sock, fd);
1825         }
1826
1827         LOCK(&manager->lock);
1828
1829         ISC_LIST_UNLINK(manager->socklist, sock, link);
1830
1831 #ifdef ISC_PLATFORM_USETHREADS
1832         if (ISC_LIST_EMPTY(manager->socklist))
1833                 SIGNAL(&manager->shutdown_ok);
1834 #endif /* ISC_PLATFORM_USETHREADS */
1835
1836         UNLOCK(&manager->lock);
1837
1838         free_socket(sockp);
1839 }
1840
1841 static isc_result_t
1842 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1843                 isc_socket_t **socketp)
1844 {
1845         isc_socket_t *sock;
1846         isc_result_t result;
1847         ISC_SOCKADDR_LEN_T cmsgbuflen;
1848
1849         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1850
1851         if (sock == NULL)
1852                 return (ISC_R_NOMEMORY);
1853
1854         result = ISC_R_UNEXPECTED;
1855
1856         sock->magic = 0;
1857         sock->references = 0;
1858
1859         sock->manager = manager;
1860         sock->type = type;
1861         sock->fd = -1;
1862         sock->statsindex = NULL;
1863
1864         ISC_LINK_INIT(sock, link);
1865
1866         sock->recvcmsgbuf = NULL;
1867         sock->sendcmsgbuf = NULL;
1868
1869         /*
1870          * set up cmsg buffers
1871          */
1872         cmsgbuflen = 0;
1873 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1874         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1875 #endif
1876 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1877         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1878 #endif
1879         sock->recvcmsgbuflen = cmsgbuflen;
1880         if (sock->recvcmsgbuflen != 0U) {
1881                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1882                 if (sock->recvcmsgbuf == NULL)
1883                         goto error;
1884         }
1885
1886         cmsgbuflen = 0;
1887 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1888         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1889 #endif
1890         sock->sendcmsgbuflen = cmsgbuflen;
1891         if (sock->sendcmsgbuflen != 0U) {
1892                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1893                 if (sock->sendcmsgbuf == NULL)
1894                         goto error;
1895         }
1896
1897         memset(sock->name, 0, sizeof(sock->name));
1898         sock->tag = NULL;
1899
1900         /*
1901          * set up list of readers and writers to be initially empty
1902          */
1903         ISC_LIST_INIT(sock->recv_list);
1904         ISC_LIST_INIT(sock->send_list);
1905         ISC_LIST_INIT(sock->accept_list);
1906         sock->connect_ev = NULL;
1907         sock->pending_recv = 0;
1908         sock->pending_send = 0;
1909         sock->pending_accept = 0;
1910         sock->listener = 0;
1911         sock->connected = 0;
1912         sock->connecting = 0;
1913         sock->bound = 0;
1914
1915         /*
1916          * initialize the lock
1917          */
1918         result = isc_mutex_init(&sock->lock);
1919         if (result != ISC_R_SUCCESS) {
1920                 sock->magic = 0;
1921                 goto error;
1922         }
1923
1924         /*
1925          * Initialize readable and writable events
1926          */
1927         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1928                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1929                        NULL, sock, sock, NULL, NULL);
1930         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1931                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1932                        NULL, sock, sock, NULL, NULL);
1933
1934         sock->magic = SOCKET_MAGIC;
1935         *socketp = sock;
1936
1937         return (ISC_R_SUCCESS);
1938
1939  error:
1940         if (sock->recvcmsgbuf != NULL)
1941                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1942                             sock->recvcmsgbuflen);
1943         if (sock->sendcmsgbuf != NULL)
1944                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1945                             sock->sendcmsgbuflen);
1946         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1947
1948         return (result);
1949 }
1950
1951 /*
1952  * This event requires that the various lists be empty, that the reference
1953  * count be 1, and that the magic number is valid.  The other socket bits,
1954  * like the lock, must be initialized as well.  The fd associated must be
1955  * marked as closed, by setting it to -1 on close, or this routine will
1956  * also close the socket.
1957  */
1958 static void
1959 free_socket(isc_socket_t **socketp) {
1960         isc_socket_t *sock = *socketp;
1961
1962         INSIST(sock->references == 0);
1963         INSIST(VALID_SOCKET(sock));
1964         INSIST(!sock->connecting);
1965         INSIST(!sock->pending_recv);
1966         INSIST(!sock->pending_send);
1967         INSIST(!sock->pending_accept);
1968         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1969         INSIST(ISC_LIST_EMPTY(sock->send_list));
1970         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1971         INSIST(!ISC_LINK_LINKED(sock, link));
1972
1973         if (sock->recvcmsgbuf != NULL)
1974                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1975                             sock->recvcmsgbuflen);
1976         if (sock->sendcmsgbuf != NULL)
1977                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1978                             sock->sendcmsgbuflen);
1979
1980         sock->magic = 0;
1981
1982         DESTROYLOCK(&sock->lock);
1983
1984         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1985
1986         *socketp = NULL;
1987 }
1988
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, parse the "major.minor" prefix of the kernel release reported
 * by uname() and clear 'bsdcompat' when it is 2.4 or later.  On other
 * platforms this does nothing and 'bsdcompat' stays ISC_TRUE.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname uts;
	char *cursor;
	long int major;
	long int minor;

	uname(&uts);    /* Can only fail if the buffer is bad in Linux. */

	/* Paranoia in parsing can be increased, but we trust uname(). */
	major = strtol(uts.release, &cursor, 10);
	if (*cursor != '.')
		return;

	minor = strtol(cursor + 1, &cursor, 10);
	if (major > 2 || (major == 2 && minor >= 4))
		bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
2026
2027 static isc_result_t
2028 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
2029         char strbuf[ISC_STRERRORSIZE];
2030         const char *err = "socket";
2031         int tries = 0;
2032 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2033         int on = 1;
2034 #endif
2035 #if defined(SO_RCVBUF)
2036         ISC_SOCKADDR_LEN_T optlen;
2037         int size;
2038 #endif
2039
2040  again:
2041         switch (sock->type) {
2042         case isc_sockettype_udp:
2043                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2044                 break;
2045         case isc_sockettype_tcp:
2046                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2047                 break;
2048         case isc_sockettype_unix:
2049                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2050                 break;
2051         case isc_sockettype_fdwatch:
2052                 /*
2053                  * We should not be called for isc_sockettype_fdwatch sockets.
2054                  */
2055                 INSIST(0);
2056                 break;
2057         }
2058         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2059                 goto again;
2060
2061 #ifdef F_DUPFD
2062         /*
2063          * Leave a space for stdio and TCP to work in.
2064          */
2065         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2066             sock->fd >= 0 && sock->fd < manager->reserved) {
2067                 int new, tmp;
2068                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2069                 tmp = errno;
2070                 (void)close(sock->fd);
2071                 errno = tmp;
2072                 sock->fd = new;
2073                 err = "isc_socket_create: fcntl/reserved";
2074         } else if (sock->fd >= 0 && sock->fd < 20) {
2075                 int new, tmp;
2076                 new = fcntl(sock->fd, F_DUPFD, 20);
2077                 tmp = errno;
2078                 (void)close(sock->fd);
2079                 errno = tmp;
2080                 sock->fd = new;
2081                 err = "isc_socket_create: fcntl";
2082         }
2083 #endif
2084
2085         if (sock->fd >= (int)manager->maxsocks) {
2086                 (void)close(sock->fd);
2087                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2088                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2089                                isc_msgcat, ISC_MSGSET_SOCKET,
2090                                ISC_MSG_TOOMANYFDS,
2091                                "socket: file descriptor exceeds limit (%d/%u)",
2092                                sock->fd, manager->maxsocks);
2093                 return (ISC_R_NORESOURCES);
2094         }
2095
2096         if (sock->fd < 0) {
2097                 switch (errno) {
2098                 case EMFILE:
2099                 case ENFILE:
2100                         isc__strerror(errno, strbuf, sizeof(strbuf));
2101                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2102                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2103                                        isc_msgcat, ISC_MSGSET_SOCKET,
2104                                        ISC_MSG_TOOMANYFDS,
2105                                        "%s: %s", err, strbuf);
2106                         /* fallthrough */
2107                 case ENOBUFS:
2108                         return (ISC_R_NORESOURCES);
2109
2110                 case EPROTONOSUPPORT:
2111                 case EPFNOSUPPORT:
2112                 case EAFNOSUPPORT:
2113                 /*
2114                  * Linux 2.2 (and maybe others) return EINVAL instead of
2115                  * EAFNOSUPPORT.
2116                  */
2117                 case EINVAL:
2118                         return (ISC_R_FAMILYNOSUPPORT);
2119
2120                 default:
2121                         isc__strerror(errno, strbuf, sizeof(strbuf));
2122                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2123                                          "%s() %s: %s", err,
2124                                          isc_msgcat_get(isc_msgcat,
2125                                                         ISC_MSGSET_GENERAL,
2126                                                         ISC_MSG_FAILED,
2127                                                         "failed"),
2128                                          strbuf);
2129                         return (ISC_R_UNEXPECTED);
2130                 }
2131         }
2132
2133         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
2134                 (void)close(sock->fd);
2135                 return (ISC_R_UNEXPECTED);
2136         }
2137
2138 #ifdef SO_BSDCOMPAT
2139         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2140                                   clear_bsdcompat) == ISC_R_SUCCESS);
2141         if (sock->type != isc_sockettype_unix && bsdcompat &&
2142             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2143                        (void *)&on, sizeof(on)) < 0) {
2144                 isc__strerror(errno, strbuf, sizeof(strbuf));
2145                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2146                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2147                                  sock->fd,
2148                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2149                                                 ISC_MSG_FAILED, "failed"),
2150                                  strbuf);
2151                 /* Press on... */
2152         }
2153 #endif
2154
2155 #ifdef SO_NOSIGPIPE
2156         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2157                        (void *)&on, sizeof(on)) < 0) {
2158                 isc__strerror(errno, strbuf, sizeof(strbuf));
2159                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2160                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2161                                  sock->fd,
2162                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2163                                                 ISC_MSG_FAILED, "failed"),
2164                                  strbuf);
2165                 /* Press on... */
2166         }
2167 #endif
2168
2169 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2170         if (sock->type == isc_sockettype_udp) {
2171
2172 #if defined(USE_CMSG)
2173 #if defined(SO_TIMESTAMP)
2174                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2175                                (void *)&on, sizeof(on)) < 0
2176                     && errno != ENOPROTOOPT) {
2177                         isc__strerror(errno, strbuf, sizeof(strbuf));
2178                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2179                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2180                                          sock->fd,
2181                                          isc_msgcat_get(isc_msgcat,
2182                                                         ISC_MSGSET_GENERAL,
2183                                                         ISC_MSG_FAILED,
2184                                                         "failed"),
2185                                          strbuf);
2186                         /* Press on... */
2187                 }
2188 #endif /* SO_TIMESTAMP */
2189
2190 #if defined(ISC_PLATFORM_HAVEIPV6)
2191                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2192                         /*
2193                          * Warn explicitly because this anomaly can be hidden
2194                          * in usual operation (and unexpectedly appear later).
2195                          */
2196                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2197                                          "No buffer available to receive "
2198                                          "IPv6 destination");
2199                 }
2200 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2201 #ifdef IPV6_RECVPKTINFO
2202                 /* RFC 3542 */
2203                 if ((sock->pf == AF_INET6)
2204                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2205                                    (void *)&on, sizeof(on)) < 0)) {
2206                         isc__strerror(errno, strbuf, sizeof(strbuf));
2207                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2208                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2209                                          "%s: %s", sock->fd,
2210                                          isc_msgcat_get(isc_msgcat,
2211                                                         ISC_MSGSET_GENERAL,
2212                                                         ISC_MSG_FAILED,
2213                                                         "failed"),
2214                                          strbuf);
2215                 }
2216 #else
2217                 /* RFC 2292 */
2218                 if ((sock->pf == AF_INET6)
2219                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2220                                    (void *)&on, sizeof(on)) < 0)) {
2221                         isc__strerror(errno, strbuf, sizeof(strbuf));
2222                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2223                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2224                                          sock->fd,
2225                                          isc_msgcat_get(isc_msgcat,
2226                                                         ISC_MSGSET_GENERAL,
2227                                                         ISC_MSG_FAILED,
2228                                                         "failed"),
2229                                          strbuf);
2230                 }
2231 #endif /* IPV6_RECVPKTINFO */
2232 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2233 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2234                 /* use minimum MTU */
2235                 if (sock->pf == AF_INET6) {
2236                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2237                                          IPV6_USE_MIN_MTU,
2238                                          (void *)&on, sizeof(on));
2239                 }
2240 #endif
2241 #if defined(IPV6_MTU)
2242                 /*
2243                  * Use minimum MTU on IPv6 sockets.
2244                  */
2245                 if (sock->pf == AF_INET6) {
2246                         int mtu = 1280;
2247                         (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
2248                                          &mtu, sizeof(mtu));
2249                 }
2250 #endif
2251 #if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2252                 /*
2253                  * Turn off Path MTU discovery on IPv6/UDP sockets.
2254                  */
2255                 if (sock->pf == AF_INET6) {
2256                         int action = IPV6_PMTUDISC_DONT;
2257                         (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
2258                                          &action, sizeof(action));
2259                 }
2260 #endif
2261 #endif /* ISC_PLATFORM_HAVEIPV6 */
2262 #endif /* defined(USE_CMSG) */
2263
2264 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2265                 /*
2266                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2267                  */
2268                 if (sock->pf == AF_INET) {
2269                         int action = IP_PMTUDISC_DONT;
2270                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2271                                          &action, sizeof(action));
2272                 }
2273 #endif
2274 #if defined(IP_DONTFRAG)
2275                 /*
2276                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2277                  */
2278                 if (sock->pf == AF_INET) {
2279                         int off = 0;
2280                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2281                                          &off, sizeof(off));
2282                 }
2283 #endif
2284
2285 #if defined(SO_RCVBUF)
2286                 optlen = sizeof(size);
2287                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2288                                (void *)&size, &optlen) >= 0 &&
2289                      size < RCVBUFSIZE) {
2290                         size = RCVBUFSIZE;
2291                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2292                                        (void *)&size, sizeof(size)) == -1) {
2293                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2294                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2295                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2296                                         sock->fd, size,
2297                                         isc_msgcat_get(isc_msgcat,
2298                                                        ISC_MSGSET_GENERAL,
2299                                                        ISC_MSG_FAILED,
2300                                                        "failed"),
2301                                         strbuf);
2302                         }
2303                 }
2304 #endif
2305         }
2306 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2307
2308         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2309
2310         return (ISC_R_SUCCESS);
2311 }
2312
2313 /*%
2314  * Create a new 'type' socket managed by 'manager'.  Events
2315  * will be posted to 'task' and when dispatched 'action' will be
2316  * called with 'arg' as the arg value.  The new socket is returned
2317  * in 'socketp'.
2318  */
2319 isc_result_t
2320 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2321                   isc_socket_t **socketp)
2322 {
2323         isc_socket_t *sock = NULL;
2324         isc_result_t result;
2325         int lockid;
2326
2327         REQUIRE(VALID_MANAGER(manager));
2328         REQUIRE(socketp != NULL && *socketp == NULL);
2329         REQUIRE(type != isc_sockettype_fdwatch);
2330
2331         result = allocate_socket(manager, type, &sock);
2332         if (result != ISC_R_SUCCESS)
2333                 return (result);
2334
2335         switch (sock->type) {
2336         case isc_sockettype_udp:
2337                 sock->statsindex =
2338                         (pf == AF_INET) ? upd4statsindex : upd6statsindex;
2339                 break;
2340         case isc_sockettype_tcp:
2341                 sock->statsindex =
2342                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2343                 break;
2344         case isc_sockettype_unix:
2345                 sock->statsindex = unixstatsindex;
2346                 break;
2347         default:
2348                 INSIST(0);
2349         }
2350
2351         sock->pf = pf;
2352         result = opensocket(manager, sock);
2353         if (result != ISC_R_SUCCESS) {
2354                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2355                 free_socket(&sock);
2356                 return (result);
2357         }
2358
2359         sock->references = 1;
2360         *socketp = sock;
2361
2362         /*
2363          * Note we don't have to lock the socket like we normally would because
2364          * there are no external references to it yet.
2365          */
2366
2367         lockid = FDLOCK_ID(sock->fd);
2368         LOCK(&manager->fdlock[lockid]);
2369         manager->fds[sock->fd] = sock;
2370         manager->fdstate[sock->fd] = MANAGED;
2371 #ifdef USE_DEVPOLL
2372         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2373                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2374 #endif
2375         UNLOCK(&manager->fdlock[lockid]);
2376
2377         LOCK(&manager->lock);
2378         ISC_LIST_APPEND(manager->socklist, sock, link);
2379 #ifdef USE_SELECT
2380         if (manager->maxfd < sock->fd)
2381                 manager->maxfd = sock->fd;
2382 #endif
2383         UNLOCK(&manager->lock);
2384
2385         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2386                    ISC_MSG_CREATED, "created");
2387
2388         return (ISC_R_SUCCESS);
2389 }
2390
2391 isc_result_t
2392 isc_socket_open(isc_socket_t *sock) {
2393         isc_result_t result;
2394
2395         REQUIRE(VALID_SOCKET(sock));
2396
2397         LOCK(&sock->lock);
2398         REQUIRE(sock->references == 1);
2399         REQUIRE(sock->type != isc_sockettype_fdwatch);
2400         UNLOCK(&sock->lock);
2401         /*
2402          * We don't need to retain the lock hereafter, since no one else has
2403          * this socket.
2404          */
2405         REQUIRE(sock->fd == -1);
2406
2407         result = opensocket(sock->manager, sock);
2408         if (result != ISC_R_SUCCESS)
2409                 sock->fd = -1;
2410
2411         if (result == ISC_R_SUCCESS) {
2412                 int lockid = FDLOCK_ID(sock->fd);
2413
2414                 LOCK(&sock->manager->fdlock[lockid]);
2415                 sock->manager->fds[sock->fd] = sock;
2416                 sock->manager->fdstate[sock->fd] = MANAGED;
2417 #ifdef USE_DEVPOLL
2418                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2419                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2420 #endif
2421                 UNLOCK(&sock->manager->fdlock[lockid]);
2422
2423 #ifdef USE_SELECT
2424                 LOCK(&sock->manager->lock);
2425                 if (sock->manager->maxfd < sock->fd)
2426                         sock->manager->maxfd = sock->fd;
2427                 UNLOCK(&sock->manager->lock);
2428 #endif
2429         }
2430
2431         return (result);
2432 }
2433
2434 /*
2435  * Create a new 'type' socket managed by 'manager'.  Events
2436  * will be posted to 'task' and when dispatched 'action' will be
2437  * called with 'arg' as the arg value.  The new socket is returned
2438  * in 'socketp'.
2439  */
2440 isc_result_t
2441 isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
2442                          isc_sockfdwatch_t callback, void *cbarg,
2443                          isc_task_t *task, isc_socket_t **socketp)
2444 {
2445         isc_socket_t *sock = NULL;
2446         isc_result_t result;
2447         int lockid;
2448
2449         REQUIRE(VALID_MANAGER(manager));
2450         REQUIRE(socketp != NULL && *socketp == NULL);
2451
2452         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2453         if (result != ISC_R_SUCCESS)
2454                 return (result);
2455
2456         sock->fd = fd;
2457         sock->fdwatcharg = cbarg;
2458         sock->fdwatchcb = callback;
2459         sock->fdwatchflags = flags;
2460         sock->fdwatchtask = task;
2461         sock->statsindex = fdwatchstatsindex;
2462
2463         sock->references = 1;
2464         *socketp = sock;
2465
2466         /*
2467          * Note we don't have to lock the socket like we normally would because
2468          * there are no external references to it yet.
2469          */
2470
2471         lockid = FDLOCK_ID(sock->fd);
2472         LOCK(&manager->fdlock[lockid]);
2473         manager->fds[sock->fd] = sock;
2474         manager->fdstate[sock->fd] = MANAGED;
2475         UNLOCK(&manager->fdlock[lockid]);
2476
2477         LOCK(&manager->lock);
2478         ISC_LIST_APPEND(manager->socklist, sock, link);
2479 #ifdef USE_SELECT
2480         if (manager->maxfd < sock->fd)
2481                 manager->maxfd = sock->fd;
2482 #endif
2483         UNLOCK(&manager->lock);
2484
2485         if (flags & ISC_SOCKFDWATCH_READ)
2486                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2487         if (flags & ISC_SOCKFDWATCH_WRITE)
2488                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2489
2490         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2491                    ISC_MSG_CREATED, "fdwatch-created");
2492
2493         return (ISC_R_SUCCESS);
2494 }
2495
2496 /*
2497  * Attach to a socket.  Caller must explicitly detach when it is done.
2498  */
2499 void
2500 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2501         REQUIRE(VALID_SOCKET(sock));
2502         REQUIRE(socketp != NULL && *socketp == NULL);
2503
2504         LOCK(&sock->lock);
2505         sock->references++;
2506         UNLOCK(&sock->lock);
2507
2508         *socketp = sock;
2509 }
2510
2511 /*
2512  * Dereference a socket.  If this is the last reference to it, clean things
2513  * up by destroying the socket.
2514  */
2515 void
2516 isc_socket_detach(isc_socket_t **socketp) {
2517         isc_socket_t *sock;
2518         isc_boolean_t kill_socket = ISC_FALSE;
2519
2520         REQUIRE(socketp != NULL);
2521         sock = *socketp;
2522         REQUIRE(VALID_SOCKET(sock));
2523
2524         LOCK(&sock->lock);
2525         REQUIRE(sock->references > 0);
2526         sock->references--;
2527         if (sock->references == 0)
2528                 kill_socket = ISC_TRUE;
2529         UNLOCK(&sock->lock);
2530
2531         if (kill_socket)
2532                 destroy(&sock);
2533
2534         *socketp = NULL;
2535 }
2536
2537 isc_result_t
2538 isc_socket_close(isc_socket_t *sock) {
2539         int fd;
2540         isc_socketmgr_t *manager;
2541         isc_sockettype_t type;
2542
2543         REQUIRE(VALID_SOCKET(sock));
2544
2545         LOCK(&sock->lock);
2546
2547         REQUIRE(sock->references == 1);
2548         REQUIRE(sock->type != isc_sockettype_fdwatch);
2549         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2550
2551         INSIST(!sock->connecting);
2552         INSIST(!sock->pending_recv);
2553         INSIST(!sock->pending_send);
2554         INSIST(!sock->pending_accept);
2555         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2556         INSIST(ISC_LIST_EMPTY(sock->send_list));
2557         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2558         INSIST(sock->connect_ev == NULL);
2559
2560         manager = sock->manager;
2561         type = sock->type;
2562         fd = sock->fd;
2563         sock->fd = -1;
2564         memset(sock->name, 0, sizeof(sock->name));
2565         sock->tag = NULL;
2566         sock->listener = 0;
2567         sock->connected = 0;
2568         sock->connecting = 0;
2569         sock->bound = 0;
2570         isc_sockaddr_any(&sock->peer_address);
2571
2572         UNLOCK(&sock->lock);
2573
2574         closesocket(manager, sock, fd);
2575
2576         return (ISC_R_SUCCESS);
2577 }
2578
2579 /*
2580  * I/O is possible on a given socket.  Schedule an event to this task that
2581  * will call an internal function to do the I/O.  This will charge the
2582  * task with the I/O operation and let our select loop handler get back
2583  * to doing something real as fast as possible.
2584  *
2585  * The socket and manager must be locked before calling this function.
2586  */
2587 static void
2588 dispatch_recv(isc_socket_t *sock) {
2589         intev_t *iev;
2590         isc_socketevent_t *ev;
2591         isc_task_t *sender;
2592
2593         INSIST(!sock->pending_recv);
2594
2595         if (sock->type != isc_sockettype_fdwatch) {
2596                 ev = ISC_LIST_HEAD(sock->recv_list);
2597                 if (ev == NULL)
2598                         return;
2599                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2600                            "dispatch_recv:  event %p -> task %p",
2601                            ev, ev->ev_sender);
2602                 sender = ev->ev_sender;
2603         } else {
2604                 sender = sock->fdwatchtask;
2605         }
2606
2607         sock->pending_recv = 1;
2608         iev = &sock->readable_ev;
2609
2610         sock->references++;
2611         iev->ev_sender = sock;
2612         if (sock->type == isc_sockettype_fdwatch)
2613                 iev->ev_action = internal_fdwatch_read;
2614         else
2615                 iev->ev_action = internal_recv;
2616         iev->ev_arg = sock;
2617
2618         isc_task_send(sender, (isc_event_t **)&iev);
2619 }
2620
2621 static void
2622 dispatch_send(isc_socket_t *sock) {
2623         intev_t *iev;
2624         isc_socketevent_t *ev;
2625         isc_task_t *sender;
2626
2627         INSIST(!sock->pending_send);
2628
2629         if (sock->type != isc_sockettype_fdwatch) {
2630                 ev = ISC_LIST_HEAD(sock->send_list);
2631                 if (ev == NULL)
2632                         return;
2633                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2634                            "dispatch_send:  event %p -> task %p",
2635                            ev, ev->ev_sender);
2636                 sender = ev->ev_sender;
2637         } else {
2638                 sender = sock->fdwatchtask;
2639         }
2640
2641         sock->pending_send = 1;
2642         iev = &sock->writable_ev;
2643
2644         sock->references++;
2645         iev->ev_sender = sock;
2646         if (sock->type == isc_sockettype_fdwatch)
2647                 iev->ev_action = internal_fdwatch_write;
2648         else
2649                 iev->ev_action = internal_send;
2650         iev->ev_arg = sock;
2651
2652         isc_task_send(sender, (isc_event_t **)&iev);
2653 }
2654
2655 /*
2656  * Dispatch an internal accept event.
2657  */
2658 static void
2659 dispatch_accept(isc_socket_t *sock) {
2660         intev_t *iev;
2661         isc_socket_newconnev_t *ev;
2662
2663         INSIST(!sock->pending_accept);
2664
2665         /*
2666          * Are there any done events left, or were they all canceled
2667          * before the manager got the socket lock?
2668          */
2669         ev = ISC_LIST_HEAD(sock->accept_list);
2670         if (ev == NULL)
2671                 return;
2672
2673         sock->pending_accept = 1;
2674         iev = &sock->readable_ev;
2675
2676         sock->references++;  /* keep socket around for this internal event */
2677         iev->ev_sender = sock;
2678         iev->ev_action = internal_accept;
2679         iev->ev_arg = sock;
2680
2681         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2682 }
2683
2684 static void
2685 dispatch_connect(isc_socket_t *sock) {
2686         intev_t *iev;
2687         isc_socket_connev_t *ev;
2688
2689         iev = &sock->writable_ev;
2690
2691         ev = sock->connect_ev;
2692         INSIST(ev != NULL); /* XXX */
2693
2694         INSIST(sock->connecting);
2695
2696         sock->references++;  /* keep socket around for this internal event */
2697         iev->ev_sender = sock;
2698         iev->ev_action = internal_connect;
2699         iev->ev_arg = sock;
2700
2701         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2702 }
2703
2704 /*
2705  * Dequeue an item off the given socket's read queue, set the result code
2706  * in the done event to the one provided, and send it to the task it was
2707  * destined for.
2708  *
2709  * If the event to be sent is on a list, remove it before sending.  If
2710  * asked to, send and detach from the socket as well.
2711  *
2712  * Caller must have the socket locked if the event is attached to the socket.
2713  */
2714 static void
2715 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2716         isc_task_t *task;
2717
2718         task = (*dev)->ev_sender;
2719
2720         (*dev)->ev_sender = sock;
2721
2722         if (ISC_LINK_LINKED(*dev, ev_link))
2723                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2724
2725         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2726             == ISC_SOCKEVENTATTR_ATTACHED)
2727                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2728         else
2729                 isc_task_send(task, (isc_event_t **)dev);
2730 }
2731
2732 /*
2733  * See comments for send_recvdone_event() above.
2734  *
2735  * Caller must have the socket locked if the event is attached to the socket.
2736  */
2737 static void
2738 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2739         isc_task_t *task;
2740
2741         INSIST(dev != NULL && *dev != NULL);
2742
2743         task = (*dev)->ev_sender;
2744         (*dev)->ev_sender = sock;
2745
2746         if (ISC_LINK_LINKED(*dev, ev_link))
2747                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2748
2749         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2750             == ISC_SOCKEVENTATTR_ATTACHED)
2751                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2752         else
2753                 isc_task_send(task, (isc_event_t **)dev);
2754 }
2755
2756 /*
2757  * Call accept() on a socket, to get the new file descriptor.  The listen
2758  * socket is used as a prototype to create a new isc_socket_t.  The new
2759  * socket has one outstanding reference.  The task receiving the event
2760  * will be detached from just after the event is delivered.
2761  *
2762  * On entry to this function, the event delivered is the internal
2763  * readable event, and the first item on the accept_list should be
2764  * the done event we want to send.  If the list is empty, this is a no-op,
2765  * so just unlock and return.
2766  */
2767 static void
2768 internal_accept(isc_task_t *me, isc_event_t *ev) {
2769         isc_socket_t *sock;
2770         isc_socketmgr_t *manager;
2771         isc_socket_newconnev_t *dev;
2772         isc_task_t *task;
2773         ISC_SOCKADDR_LEN_T addrlen;
2774         int fd;
2775         isc_result_t result = ISC_R_SUCCESS;
2776         char strbuf[ISC_STRERRORSIZE];
2777         const char *err = "accept";
2778
2779         UNUSED(me);
2780
2781         sock = ev->ev_sender;
2782         INSIST(VALID_SOCKET(sock));
2783
2784         LOCK(&sock->lock);
2785         socket_log(sock, NULL, TRACE,
2786                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2787                    "internal_accept called, locked socket");
2788
2789         manager = sock->manager;
2790         INSIST(VALID_MANAGER(manager));
2791
2792         INSIST(sock->listener);
2793         INSIST(sock->pending_accept == 1);
2794         sock->pending_accept = 0;
2795
2796         INSIST(sock->references > 0);
2797         sock->references--;  /* the internal event is done with this socket */
2798         if (sock->references == 0) {
2799                 UNLOCK(&sock->lock);
2800                 destroy(&sock);
2801                 return;
2802         }
2803
2804         /*
2805          * Get the first item off the accept list.
2806          * If it is empty, unlock the socket and return.
2807          */
2808         dev = ISC_LIST_HEAD(sock->accept_list);
2809         if (dev == NULL) {
2810                 UNLOCK(&sock->lock);
2811                 return;
2812         }
2813
2814         /*
2815          * Try to accept the new connection.  If the accept fails with
2816          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2817          * again.  Also ignore ECONNRESET, which has been reported to
2818          * be spuriously returned on Linux 2.2.19 although it is not
2819          * a documented error for accept().  ECONNABORTED has been
2820          * reported for Solaris 8.  The rest are thrown in not because
2821          * we have seen them but because they are ignored by other
2822          * daemons such as BIND 8 and Apache.
2823          */
2824
2825         addrlen = sizeof(dev->newsocket->peer_address.type);
2826         memset(&dev->newsocket->peer_address.type, 0, addrlen);
2827         fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa,
2828                     (void *)&addrlen);
2829
2830 #ifdef F_DUPFD
2831         /*
2832          * Leave a space for stdio to work in.
2833          */
2834         if (fd >= 0 && fd < 20) {
2835                 int new, tmp;
2836                 new = fcntl(fd, F_DUPFD, 20);
2837                 tmp = errno;
2838                 (void)close(fd);
2839                 errno = tmp;
2840                 fd = new;
2841                 err = "accept/fcntl";
2842         }
2843 #endif
2844
2845         if (fd < 0) {
2846                 if (SOFT_ERROR(errno))
2847                         goto soft_error;
2848                 switch (errno) {
2849                 case ENFILE:
2850                 case EMFILE:
2851                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2852                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2853                                        isc_msgcat, ISC_MSGSET_SOCKET,
2854                                        ISC_MSG_TOOMANYFDS,
2855                                        "%s: too many open file descriptors",
2856                                        err);
2857                         goto soft_error;
2858
2859                 case ENOBUFS:
2860                 case ENOMEM:
2861                 case ECONNRESET:
2862                 case ECONNABORTED:
2863                 case EHOSTUNREACH:
2864                 case EHOSTDOWN:
2865                 case ENETUNREACH:
2866                 case ENETDOWN:
2867                 case ECONNREFUSED:
2868 #ifdef EPROTO
2869                 case EPROTO:
2870 #endif
2871 #ifdef ENONET
2872                 case ENONET:
2873 #endif
2874                         goto soft_error;
2875                 default:
2876                         break;
2877                 }
2878                 isc__strerror(errno, strbuf, sizeof(strbuf));
2879                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2880                                  "internal_accept: %s() %s: %s", err,
2881                                  isc_msgcat_get(isc_msgcat,
2882                                                 ISC_MSGSET_GENERAL,
2883                                                 ISC_MSG_FAILED,
2884                                                 "failed"),
2885                                  strbuf);
2886                 fd = -1;
2887                 result = ISC_R_UNEXPECTED;
2888         } else {
2889                 if (addrlen == 0U) {
2890                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2891                                          "internal_accept(): "
2892                                          "accept() failed to return "
2893                                          "remote address");
2894
2895                         (void)close(fd);
2896                         goto soft_error;
2897                 } else if (dev->newsocket->peer_address.type.sa.sa_family !=
2898                            sock->pf)
2899                 {
2900                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2901                                          "internal_accept(): "
2902                                          "accept() returned peer address "
2903                                          "family %u (expected %u)",
2904                                          dev->newsocket->peer_address.
2905                                          type.sa.sa_family,
2906                                          sock->pf);
2907                         (void)close(fd);
2908                         goto soft_error;
2909                 } else if (fd >= (int)manager->maxsocks) {
2910                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2911                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2912                                        isc_msgcat, ISC_MSGSET_SOCKET,
2913                                        ISC_MSG_TOOMANYFDS,
2914                                        "accept: "
2915                                        "file descriptor exceeds limit (%d/%u)",
2916                                        fd, manager->maxsocks);
2917                         (void)close(fd);
2918                         goto soft_error;
2919                 }
2920         }
2921
2922         if (fd != -1) {
2923                 dev->newsocket->peer_address.length = addrlen;
2924                 dev->newsocket->pf = sock->pf;
2925         }
2926
2927         /*
2928          * Pull off the done event.
2929          */
2930         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2931
2932         /*
2933          * Poke watcher if there are more pending accepts.
2934          */
2935         if (!ISC_LIST_EMPTY(sock->accept_list))
2936                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2937
2938         UNLOCK(&sock->lock);
2939
2940         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2941                 (void)close(fd);
2942                 fd = -1;
2943                 result = ISC_R_UNEXPECTED;
2944         }
2945
2946         /*
2947          * -1 means the new socket didn't happen.
2948          */
2949         if (fd != -1) {
2950                 int lockid = FDLOCK_ID(fd);
2951
2952                 LOCK(&manager->fdlock[lockid]);
2953                 manager->fds[fd] = dev->newsocket;
2954                 manager->fdstate[fd] = MANAGED;
2955                 UNLOCK(&manager->fdlock[lockid]);
2956
2957                 LOCK(&manager->lock);
2958                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2959
2960                 dev->newsocket->fd = fd;
2961                 dev->newsocket->bound = 1;
2962                 dev->newsocket->connected = 1;
2963
2964                 /*
2965                  * Save away the remote address
2966                  */
2967                 dev->address = dev->newsocket->peer_address;
2968
2969 #ifdef USE_SELECT
2970                 if (manager->maxfd < fd)
2971                         manager->maxfd = fd;
2972 #endif
2973
2974                 socket_log(sock, &dev->newsocket->peer_address, CREATION,
2975                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2976                            "accepted connection, new socket %p",
2977                            dev->newsocket);
2978
2979                 UNLOCK(&manager->lock);
2980
2981                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
2982         } else {
2983                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2984                 dev->newsocket->references--;
2985                 free_socket(&dev->newsocket);
2986         }
2987
2988         /*
2989          * Fill in the done event details and send it off.
2990          */
2991         dev->result = result;
2992         task = dev->ev_sender;
2993         dev->ev_sender = sock;
2994
2995         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2996         return;
2997
2998  soft_error:
2999         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3000         UNLOCK(&sock->lock);
3001
3002         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3003         return;
3004 }
3005
3006 static void
3007 internal_recv(isc_task_t *me, isc_event_t *ev) {
3008         isc_socketevent_t *dev;
3009         isc_socket_t *sock;
3010
3011         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3012
3013         sock = ev->ev_sender;
3014         INSIST(VALID_SOCKET(sock));
3015
3016         LOCK(&sock->lock);
3017         socket_log(sock, NULL, IOEVENT,
3018                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3019                    "internal_recv: task %p got event %p", me, ev);
3020
3021         INSIST(sock->pending_recv == 1);
3022         sock->pending_recv = 0;
3023
3024         INSIST(sock->references > 0);
3025         sock->references--;  /* the internal event is done with this socket */
3026         if (sock->references == 0) {
3027                 UNLOCK(&sock->lock);
3028                 destroy(&sock);
3029                 return;
3030         }
3031
3032         /*
3033          * Try to do as much I/O as possible on this socket.  There are no
3034          * limits here, currently.
3035          */
3036         dev = ISC_LIST_HEAD(sock->recv_list);
3037         while (dev != NULL) {
3038                 switch (doio_recv(sock, dev)) {
3039                 case DOIO_SOFT:
3040                         goto poke;
3041
3042                 case DOIO_EOF:
3043                         /*
3044                          * read of 0 means the remote end was closed.
3045                          * Run through the event queue and dispatch all
3046                          * the events with an EOF result code.
3047                          */
3048                         do {
3049                                 dev->result = ISC_R_EOF;
3050                                 send_recvdone_event(sock, &dev);
3051                                 dev = ISC_LIST_HEAD(sock->recv_list);
3052                         } while (dev != NULL);
3053                         goto poke;
3054
3055                 case DOIO_SUCCESS:
3056                 case DOIO_HARD:
3057                         send_recvdone_event(sock, &dev);
3058                         break;
3059                 }
3060
3061                 dev = ISC_LIST_HEAD(sock->recv_list);
3062         }
3063
3064  poke:
3065         if (!ISC_LIST_EMPTY(sock->recv_list))
3066                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3067
3068         UNLOCK(&sock->lock);
3069 }
3070
3071 static void
3072 internal_send(isc_task_t *me, isc_event_t *ev) {
3073         isc_socketevent_t *dev;
3074         isc_socket_t *sock;
3075
3076         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3077
3078         /*
3079          * Find out what socket this is and lock it.
3080          */
3081         sock = (isc_socket_t *)ev->ev_sender;
3082         INSIST(VALID_SOCKET(sock));
3083
3084         LOCK(&sock->lock);
3085         socket_log(sock, NULL, IOEVENT,
3086                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3087                    "internal_send: task %p got event %p", me, ev);
3088
3089         INSIST(sock->pending_send == 1);
3090         sock->pending_send = 0;
3091
3092         INSIST(sock->references > 0);
3093         sock->references--;  /* the internal event is done with this socket */
3094         if (sock->references == 0) {
3095                 UNLOCK(&sock->lock);
3096                 destroy(&sock);
3097                 return;
3098         }
3099
3100         /*
3101          * Try to do as much I/O as possible on this socket.  There are no
3102          * limits here, currently.
3103          */
3104         dev = ISC_LIST_HEAD(sock->send_list);
3105         while (dev != NULL) {
3106                 switch (doio_send(sock, dev)) {
3107                 case DOIO_SOFT:
3108                         goto poke;
3109
3110                 case DOIO_HARD:
3111                 case DOIO_SUCCESS:
3112                         send_senddone_event(sock, &dev);
3113                         break;
3114                 }
3115
3116                 dev = ISC_LIST_HEAD(sock->send_list);
3117         }
3118
3119  poke:
3120         if (!ISC_LIST_EMPTY(sock->send_list))
3121                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3122
3123         UNLOCK(&sock->lock);
3124 }
3125
3126 static void
3127 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3128         isc_socket_t *sock;
3129         int more_data;
3130
3131         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3132
3133         /*
3134          * Find out what socket this is and lock it.
3135          */
3136         sock = (isc_socket_t *)ev->ev_sender;
3137         INSIST(VALID_SOCKET(sock));
3138
3139         LOCK(&sock->lock);
3140         socket_log(sock, NULL, IOEVENT,
3141                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3142                    "internal_fdwatch_write: task %p got event %p", me, ev);
3143
3144         INSIST(sock->pending_send == 1);
3145
3146         UNLOCK(&sock->lock);
3147         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3148         LOCK(&sock->lock);
3149
3150         sock->pending_send = 0;
3151
3152         INSIST(sock->references > 0);
3153         sock->references--;  /* the internal event is done with this socket */
3154         if (sock->references == 0) {
3155                 UNLOCK(&sock->lock);
3156                 destroy(&sock);
3157                 return;
3158         }
3159
3160         if (more_data)
3161                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3162
3163         UNLOCK(&sock->lock);
3164 }
3165
3166 static void
3167 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3168         isc_socket_t *sock;
3169         int more_data;
3170
3171         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3172
3173         /*
3174          * Find out what socket this is and lock it.
3175          */
3176         sock = (isc_socket_t *)ev->ev_sender;
3177         INSIST(VALID_SOCKET(sock));
3178
3179         LOCK(&sock->lock);
3180         socket_log(sock, NULL, IOEVENT,
3181                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3182                    "internal_fdwatch_read: task %p got event %p", me, ev);
3183
3184         INSIST(sock->pending_recv == 1);
3185
3186         UNLOCK(&sock->lock);
3187         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3188         LOCK(&sock->lock);
3189
3190         sock->pending_recv = 0;
3191
3192         INSIST(sock->references > 0);
3193         sock->references--;  /* the internal event is done with this socket */
3194         if (sock->references == 0) {
3195                 UNLOCK(&sock->lock);
3196                 destroy(&sock);
3197                 return;
3198         }
3199
3200         if (more_data)
3201                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3202
3203         UNLOCK(&sock->lock);
3204 }
3205
3206 /*
3207  * Process read/writes on each fd here.  Avoid locking
3208  * and unlocking twice if both reads and writes are possible.
3209  */
3210 static void
3211 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
3212            isc_boolean_t writeable)
3213 {
3214         isc_socket_t *sock;
3215         isc_boolean_t unlock_sock;
3216         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3217         int lockid = FDLOCK_ID(fd);
3218
3219         /*
3220          * If the socket is going to be closed, don't do more I/O.
3221          */
3222         LOCK(&manager->fdlock[lockid]);
3223         if (manager->fdstate[fd] == CLOSE_PENDING) {
3224                 UNLOCK(&manager->fdlock[lockid]);
3225
3226                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3227                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3228                 return;
3229         }
3230
3231         sock = manager->fds[fd];
3232         unlock_sock = ISC_FALSE;
3233         if (readable) {
3234                 if (sock == NULL) {
3235                         unwatch_read = ISC_TRUE;
3236                         goto check_write;
3237                 }
3238                 unlock_sock = ISC_TRUE;
3239                 LOCK(&sock->lock);
3240                 if (!SOCK_DEAD(sock)) {
3241                         if (sock->listener)
3242                                 dispatch_accept(sock);
3243                         else
3244                                 dispatch_recv(sock);
3245                 }
3246                 unwatch_read = ISC_TRUE;
3247         }
3248 check_write:
3249         if (writeable) {
3250                 if (sock == NULL) {
3251                         unwatch_write = ISC_TRUE;
3252                         goto unlock_fd;
3253                 }
3254                 if (!unlock_sock) {
3255                         unlock_sock = ISC_TRUE;
3256                         LOCK(&sock->lock);
3257                 }
3258                 if (!SOCK_DEAD(sock)) {
3259                         if (sock->connecting)
3260                                 dispatch_connect(sock);
3261                         else
3262                                 dispatch_send(sock);
3263                 }
3264                 unwatch_write = ISC_TRUE;
3265         }
3266         if (unlock_sock)
3267                 UNLOCK(&sock->lock);
3268
3269  unlock_fd:
3270         UNLOCK(&manager->fdlock[lockid]);
3271         if (unwatch_read)
3272                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3273         if (unwatch_write)
3274                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3275
3276 }
3277
3278 #ifdef USE_KQUEUE
3279 static isc_boolean_t
3280 process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
3281         int i;
3282         isc_boolean_t readable, writable;
3283         isc_boolean_t done = ISC_FALSE;
3284 #ifdef ISC_PLATFORM_USETHREADS
3285         isc_boolean_t have_ctlevent = ISC_FALSE;
3286 #endif
3287
3288         if (nevents == manager->nevents) {
3289                 /*
3290                  * This is not an error, but something unexpected.  If this
3291                  * happens, it may indicate the need for increasing
3292                  * ISC_SOCKET_MAXEVENTS.
3293                  */
3294                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3295                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3296                             "maximum number of FD events (%d) received",
3297                             nevents);
3298         }
3299
3300         for (i = 0; i < nevents; i++) {
3301                 REQUIRE(events[i].ident < manager->maxsocks);
3302 #ifdef ISC_PLATFORM_USETHREADS
3303                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3304                         have_ctlevent = ISC_TRUE;
3305                         continue;
3306                 }
3307 #endif
3308                 readable = ISC_TF(events[i].filter == EVFILT_READ);
3309                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3310                 process_fd(manager, events[i].ident, readable, writable);
3311         }
3312
3313 #ifdef ISC_PLATFORM_USETHREADS
3314         if (have_ctlevent)
3315                 done = process_ctlfd(manager);
3316 #endif
3317
3318         return (done);
3319 }
3320 #elif defined(USE_EPOLL)
3321 static isc_boolean_t
3322 process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
3323         int i;
3324         isc_boolean_t done = ISC_FALSE;
3325 #ifdef ISC_PLATFORM_USETHREADS
3326         isc_boolean_t have_ctlevent = ISC_FALSE;
3327 #endif
3328
3329         if (nevents == manager->nevents) {
3330                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3331                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3332                             "maximum number of FD events (%d) received",
3333                             nevents);
3334         }
3335
3336         for (i = 0; i < nevents; i++) {
3337                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3338 #ifdef ISC_PLATFORM_USETHREADS
3339                 if (events[i].data.fd == manager->pipe_fds[0]) {
3340                         have_ctlevent = ISC_TRUE;
3341                         continue;
3342                 }
3343 #endif
3344                 if ((events[i].events & EPOLLERR) != 0 ||
3345                     (events[i].events & EPOLLHUP) != 0) {
3346                         /*
3347                          * epoll does not set IN/OUT bits on an erroneous
3348                          * condition, so we need to try both anyway.  This is a
3349                          * bit inefficient, but should be okay for such rare
3350                          * events.  Note also that the read or write attempt
3351                          * won't block because we use non-blocking sockets.
3352                          */
3353                         events[i].events |= (EPOLLIN | EPOLLOUT);
3354                 }
3355                 process_fd(manager, events[i].data.fd,
3356                            (events[i].events & EPOLLIN) != 0,
3357                            (events[i].events & EPOLLOUT) != 0);
3358         }
3359
3360 #ifdef ISC_PLATFORM_USETHREADS
3361         if (have_ctlevent)
3362                 done = process_ctlfd(manager);
3363 #endif
3364
3365         return (done);
3366 }
3367 #elif defined(USE_DEVPOLL)
3368 static isc_boolean_t
3369 process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
3370         int i;
3371         isc_boolean_t done = ISC_FALSE;
3372 #ifdef ISC_PLATFORM_USETHREADS
3373         isc_boolean_t have_ctlevent = ISC_FALSE;
3374 #endif
3375
3376         if (nevents == manager->nevents) {
3377                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3378                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3379                             "maximum number of FD events (%d) received",
3380                             nevents);
3381         }
3382
3383         for (i = 0; i < nevents; i++) {
3384                 REQUIRE(events[i].fd < (int)manager->maxsocks);
3385 #ifdef ISC_PLATFORM_USETHREADS
3386                 if (events[i].fd == manager->pipe_fds[0]) {
3387                         have_ctlevent = ISC_TRUE;
3388                         continue;
3389                 }
3390 #endif
3391                 process_fd(manager, events[i].fd,
3392                            (events[i].events & POLLIN) != 0,
3393                            (events[i].events & POLLOUT) != 0);
3394         }
3395
3396 #ifdef ISC_PLATFORM_USETHREADS
3397         if (have_ctlevent)
3398                 done = process_ctlfd(manager);
3399 #endif
3400
3401         return (done);
3402 }
3403 #elif defined(USE_SELECT)
3404 static void
3405 process_fds(isc_socketmgr_t *manager, int maxfd,
3406             fd_set *readfds, fd_set *writefds)
3407 {
3408         int i;
3409
3410         REQUIRE(maxfd <= (int)manager->maxsocks);
3411
3412         for (i = 0; i < maxfd; i++) {
3413 #ifdef ISC_PLATFORM_USETHREADS
3414                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3415                         continue;
3416 #endif /* ISC_PLATFORM_USETHREADS */
3417                 process_fd(manager, i, FD_ISSET(i, readfds),
3418                            FD_ISSET(i, writefds));
3419         }
3420 }
3421 #endif
3422
3423 #ifdef ISC_PLATFORM_USETHREADS
3424 static isc_boolean_t
3425 process_ctlfd(isc_socketmgr_t *manager) {
3426         int msg, fd;
3427
3428         for (;;) {
3429                 select_readmsg(manager, &fd, &msg);
3430
3431                 manager_log(manager, IOEVENT,
3432                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3433                                            ISC_MSG_WATCHERMSG,
3434                                            "watcher got message %d "
3435                                            "for socket %d"), msg, fd);
3436
3437                 /*
3438                  * Nothing to read?
3439                  */
3440                 if (msg == SELECT_POKE_NOTHING)
3441                         break;
3442
3443                 /*
3444                  * Handle shutdown message.  We really should
3445                  * jump out of this loop right away, but
3446                  * it doesn't matter if we have to do a little
3447                  * more work first.
3448                  */
3449                 if (msg == SELECT_POKE_SHUTDOWN)
3450                         return (ISC_TRUE);
3451
3452                 /*
3453                  * This is a wakeup on a socket.  Look
3454                  * at the event queue for both read and write,
3455                  * and decide if we need to watch on it now
3456                  * or not.
3457                  */
3458                 wakeup_socket(manager, fd, msg);
3459         }
3460
3461         return (ISC_FALSE);
3462 }
3463
3464 /*
3465  * This is the thread that will loop forever, always in a select or poll
3466  * call.
3467  *
3468  * When select returns something to do, track down what thread gets to do
3469  * this I/O and post the event to it.
3470  */
3471 static isc_threadresult_t
3472 watcher(void *uap) {
3473         isc_socketmgr_t *manager = uap;
3474         isc_boolean_t done;
3475         int ctlfd;
3476         int cc;
3477 #ifdef USE_KQUEUE
3478         const char *fnname = "kevent()";
3479 #elif defined (USE_EPOLL)
3480         const char *fnname = "epoll_wait()";
3481 #elif defined(USE_DEVPOLL)
3482         const char *fnname = "ioctl(DP_POLL)";
3483         struct dvpoll dvp;
3484 #elif defined (USE_SELECT)
3485         const char *fnname = "select()";
3486         int maxfd;
3487 #endif
3488         char strbuf[ISC_STRERRORSIZE];
3489 #ifdef ISC_SOCKET_USE_POLLWATCH
3490         pollstate_t pollstate = poll_idle;
3491 #endif
3492
3493         /*
3494          * Get the control fd here.  This will never change.
3495          */
3496         ctlfd = manager->pipe_fds[0];
3497         done = ISC_FALSE;
3498         while (!done) {
3499                 do {
3500 #ifdef USE_KQUEUE
3501                         cc = kevent(manager->kqueue_fd, NULL, 0,
3502                                     manager->events, manager->nevents, NULL);
3503 #elif defined(USE_EPOLL)
3504                         cc = epoll_wait(manager->epoll_fd, manager->events,
3505                                         manager->nevents, -1);
3506 #elif defined(USE_DEVPOLL)
3507                         dvp.dp_fds = manager->events;
3508                         dvp.dp_nfds = manager->nevents;
3509 #ifndef ISC_SOCKET_USE_POLLWATCH
3510                         dvp.dp_timeout = -1;
3511 #else
3512                         if (pollstate == poll_idle)
3513                                 dvp.dp_timeout = -1;
3514                         else
3515                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3516 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3517                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3518 #elif defined(USE_SELECT)
3519                         LOCK(&manager->lock);
3520                         memcpy(manager->read_fds_copy, manager->read_fds,
3521                                manager->fd_bufsize);
3522                         memcpy(manager->write_fds_copy, manager->write_fds,
3523                                manager->fd_bufsize);
3524                         maxfd = manager->maxfd + 1;
3525                         UNLOCK(&manager->lock);
3526
3527                         cc = select(maxfd, manager->read_fds_copy,
3528                                     manager->write_fds_copy, NULL, NULL);
3529 #endif  /* USE_KQUEUE */
3530
3531                         if (cc < 0 && !SOFT_ERROR(errno)) {
3532                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3533                                 FATAL_ERROR(__FILE__, __LINE__,
3534                                             "%s %s: %s", fnname,
3535                                             isc_msgcat_get(isc_msgcat,
3536                                                            ISC_MSGSET_GENERAL,
3537                                                            ISC_MSG_FAILED,
3538                                                            "failed"), strbuf);
3539                         }
3540
3541 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3542                         if (cc == 0) {
3543                                 if (pollstate == poll_active)
3544                                         pollstate = poll_checking;
3545                                 else if (pollstate == poll_checking)
3546                                         pollstate = poll_idle;
3547                         } else if (cc > 0) {
3548                                 if (pollstate == poll_checking) {
3549                                         /*
3550                                          * XXX: We'd like to use a more
3551                                          * verbose log level as it's actually an
3552                                          * unexpected event, but the kernel bug
3553                                          * reportedly happens pretty frequently
3554                                          * (and it can also be a false positive)
3555                                          * so it would be just too noisy.
3556                                          */
3557                                         manager_log(manager,
3558                                                     ISC_LOGCATEGORY_GENERAL,
3559                                                     ISC_LOGMODULE_SOCKET,
3560                                                     ISC_LOG_DEBUG(1),
3561                                                     "unexpected POLL timeout");
3562                                 }
3563                                 pollstate = poll_active;
3564                         }
3565 #endif
3566                 } while (cc < 0);
3567
3568 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3569                 done = process_fds(manager, manager->events, cc);
3570 #elif defined(USE_SELECT)
3571                 process_fds(manager, maxfd, manager->read_fds_copy,
3572                             manager->write_fds_copy);
3573
3574                 /*
3575                  * Process reads on internal, control fd.
3576                  */
3577                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3578                         done = process_ctlfd(manager);
3579 #endif
3580         }
3581
3582         manager_log(manager, TRACE, "%s",
3583                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3584                                    ISC_MSG_EXITING, "watcher exiting"));
3585
3586         return ((isc_threadresult_t)0);
3587 }
3588 #endif /* ISC_PLATFORM_USETHREADS */
3589
3590 void
3591 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3592
3593         REQUIRE(VALID_MANAGER(manager));
3594
3595         manager->reserved = reserved;
3596 }
3597
3598 /*
3599  * Create a new socket manager.
3600  */
3601
3602 static isc_result_t
3603 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3604         isc_result_t result;
3605 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3606         char strbuf[ISC_STRERRORSIZE];
3607 #endif
3608
3609 #ifdef USE_KQUEUE
3610         manager->nevents = ISC_SOCKET_MAXEVENTS;
3611         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3612                                       manager->nevents);
3613         if (manager->events == NULL)
3614                 return (ISC_R_NOMEMORY);
3615         manager->kqueue_fd = kqueue();
3616         if (manager->kqueue_fd == -1) {
3617                 result = isc__errno2result(errno);
3618                 isc__strerror(errno, strbuf, sizeof(strbuf));
3619                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3620                                  "kqueue %s: %s",
3621                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3622                                                 ISC_MSG_FAILED, "failed"),
3623                                  strbuf);
3624                 isc_mem_put(mctx, manager->events,
3625                             sizeof(struct kevent) * manager->nevents);
3626                 return (result);
3627         }
3628
3629 #ifdef ISC_PLATFORM_USETHREADS
3630         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3631         if (result != ISC_R_SUCCESS) {
3632                 close(manager->kqueue_fd);
3633                 isc_mem_put(mctx, manager->events,
3634                             sizeof(struct kevent) * manager->nevents);
3635                 return (result);
3636         }
3637 #endif  /* ISC_PLATFORM_USETHREADS */
3638 #elif defined(USE_EPOLL)
3639         manager->nevents = ISC_SOCKET_MAXEVENTS;
3640         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3641                                       manager->nevents);
3642         if (manager->events == NULL)
3643                 return (ISC_R_NOMEMORY);
3644         manager->epoll_fd = epoll_create(manager->nevents);
3645         if (manager->epoll_fd == -1) {
3646                 result = isc__errno2result(errno);
3647                 isc__strerror(errno, strbuf, sizeof(strbuf));
3648                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3649                                  "epoll_create %s: %s",
3650                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3651                                                 ISC_MSG_FAILED, "failed"),
3652                                  strbuf);
3653                 isc_mem_put(mctx, manager->events,
3654                             sizeof(struct epoll_event) * manager->nevents);
3655                 return (result);
3656         }
3657 #ifdef ISC_PLATFORM_USETHREADS
3658         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3659         if (result != ISC_R_SUCCESS) {
3660                 close(manager->epoll_fd);
3661                 isc_mem_put(mctx, manager->events,
3662                             sizeof(struct epoll_event) * manager->nevents);
3663                 return (result);
3664         }
3665 #endif  /* ISC_PLATFORM_USETHREADS */
3666 #elif defined(USE_DEVPOLL)
3667         /*
3668          * XXXJT: /dev/poll seems to reject large numbers of events,
3669          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3670          */
3671         manager->nevents = ISC_SOCKET_MAXEVENTS;
3672         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3673                                       manager->nevents);
3674         if (manager->events == NULL)
3675                 return (ISC_R_NOMEMORY);
3676         /*
3677          * Note: fdpollinfo should be able to support all possible FDs, so
3678          * it must have maxsocks entries (not nevents).
3679          */
3680         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3681                                           manager->maxsocks);
3682         if (manager->fdpollinfo == NULL) {
3683                 isc_mem_put(mctx, manager->events,
3684                             sizeof(struct pollfd) * manager->nevents);
3685                 return (ISC_R_NOMEMORY);
3686         }
3687         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3688         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3689         if (manager->devpoll_fd == -1) {
3690                 result = isc__errno2result(errno);
3691                 isc__strerror(errno, strbuf, sizeof(strbuf));
3692                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3693                                  "open(/dev/poll) %s: %s",
3694                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3695                                                 ISC_MSG_FAILED, "failed"),
3696                                  strbuf);
3697                 isc_mem_put(mctx, manager->events,
3698                             sizeof(struct pollfd) * manager->nevents);
3699                 isc_mem_put(mctx, manager->fdpollinfo,
3700                             sizeof(pollinfo_t) * manager->maxsocks);
3701                 return (result);
3702         }
3703 #ifdef ISC_PLATFORM_USETHREADS
3704         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3705         if (result != ISC_R_SUCCESS) {
3706                 close(manager->devpoll_fd);
3707                 isc_mem_put(mctx, manager->events,
3708                             sizeof(struct pollfd) * manager->nevents);
3709                 isc_mem_put(mctx, manager->fdpollinfo,
3710                             sizeof(pollinfo_t) * manager->maxsocks);
3711                 return (result);
3712         }
3713 #endif  /* ISC_PLATFORM_USETHREADS */
3714 #elif defined(USE_SELECT)
3715         UNUSED(result);
3716
3717 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3718         /*
3719          * Note: this code should also cover the case of MAXSOCKETS <=
3720          * FD_SETSIZE, but we separate the cases to avoid possible portability
3721          * issues regarding howmany() and the actual representation of fd_set.
3722          */
3723         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3724                 sizeof(fd_mask);
3725 #else
3726         manager->fd_bufsize = sizeof(fd_set);
3727 #endif
3728
3729         manager->read_fds = NULL;
3730         manager->read_fds_copy = NULL;
3731         manager->write_fds = NULL;
3732         manager->write_fds_copy = NULL;
3733
3734         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3735         if (manager->read_fds != NULL)
3736                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3737         if (manager->read_fds_copy != NULL)
3738                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3739         if (manager->write_fds != NULL) {
3740                 manager->write_fds_copy = isc_mem_get(mctx,
3741                                                       manager->fd_bufsize);
3742         }
3743         if (manager->write_fds_copy == NULL) {
3744                 if (manager->write_fds != NULL) {
3745                         isc_mem_put(mctx, manager->write_fds,
3746                                     manager->fd_bufsize);
3747                 }
3748                 if (manager->read_fds_copy != NULL) {
3749                         isc_mem_put(mctx, manager->read_fds_copy,
3750                                     manager->fd_bufsize);
3751                 }
3752                 if (manager->read_fds != NULL) {
3753                         isc_mem_put(mctx, manager->read_fds,
3754                                     manager->fd_bufsize);
3755                 }
3756                 return (ISC_R_NOMEMORY);
3757         }
3758         memset(manager->read_fds, 0, manager->fd_bufsize);
3759         memset(manager->write_fds, 0, manager->fd_bufsize);
3760
3761 #ifdef ISC_PLATFORM_USETHREADS
3762         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3763         manager->maxfd = manager->pipe_fds[0];
3764 #else /* ISC_PLATFORM_USETHREADS */
3765         manager->maxfd = 0;
3766 #endif /* ISC_PLATFORM_USETHREADS */
3767 #endif  /* USE_KQUEUE */
3768
3769         return (ISC_R_SUCCESS);
3770 }
3771
3772 static void
3773 cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3774 #ifdef ISC_PLATFORM_USETHREADS
3775         isc_result_t result;
3776
3777         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3778         if (result != ISC_R_SUCCESS) {
3779                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3780                                  "epoll_ctl(DEL) %s",
3781                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3782                                                 ISC_MSG_FAILED, "failed"));
3783         }
3784 #endif  /* ISC_PLATFORM_USETHREADS */
3785
3786 #ifdef USE_KQUEUE
3787         close(manager->kqueue_fd);
3788         isc_mem_put(mctx, manager->events,
3789                     sizeof(struct kevent) * manager->nevents);
3790 #elif defined(USE_EPOLL)
3791         close(manager->epoll_fd);
3792         isc_mem_put(mctx, manager->events,
3793                     sizeof(struct epoll_event) * manager->nevents);
3794 #elif defined(USE_DEVPOLL)
3795         close(manager->devpoll_fd);
3796         isc_mem_put(mctx, manager->events,
3797                     sizeof(struct pollfd) * manager->nevents);
3798         isc_mem_put(mctx, manager->fdpollinfo,
3799                     sizeof(pollinfo_t) * manager->maxsocks);
3800 #elif defined(USE_SELECT)
3801         if (manager->read_fds != NULL)
3802                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3803         if (manager->read_fds_copy != NULL)
3804                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3805         if (manager->write_fds != NULL)
3806                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3807         if (manager->write_fds_copy != NULL)
3808                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3809 #endif  /* USE_KQUEUE */
3810 }
3811
3812 isc_result_t
3813 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3814         return (isc_socketmgr_create2(mctx, managerp, 0));
3815 }
3816
3817 isc_result_t
3818 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3819                       unsigned int maxsocks)
3820 {
3821         int i;
3822         isc_socketmgr_t *manager;
3823 #ifdef ISC_PLATFORM_USETHREADS
3824         char strbuf[ISC_STRERRORSIZE];
3825 #endif
3826         isc_result_t result;
3827
3828         REQUIRE(managerp != NULL && *managerp == NULL);
3829
3830 #ifndef ISC_PLATFORM_USETHREADS
3831         if (socketmgr != NULL) {
3832                 /* Don't allow maxsocks to be updated */
3833                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3834                         return (ISC_R_EXISTS);
3835
3836                 socketmgr->refs++;
3837                 *managerp = socketmgr;
3838                 return (ISC_R_SUCCESS);
3839         }
3840 #endif /* ISC_PLATFORM_USETHREADS */
3841
3842         if (maxsocks == 0)
3843                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3844
3845         manager = isc_mem_get(mctx, sizeof(*manager));
3846         if (manager == NULL)
3847                 return (ISC_R_NOMEMORY);
3848
3849         /* zero-clear so that necessary cleanup on failure will be easy */
3850         memset(manager, 0, sizeof(*manager));
3851         manager->maxsocks = maxsocks;
3852         manager->reserved = 0;
3853         manager->fds = isc_mem_get(mctx,
3854                                    manager->maxsocks * sizeof(isc_socket_t *));
3855         if (manager->fds == NULL) {
3856                 result = ISC_R_NOMEMORY;
3857                 goto free_manager;
3858         }
3859         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3860         if (manager->fdstate == NULL) {
3861                 result = ISC_R_NOMEMORY;
3862                 goto free_manager;
3863         }
3864         manager->stats = NULL;
3865
3866         manager->magic = SOCKET_MANAGER_MAGIC;
3867         manager->mctx = NULL;
3868         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3869         ISC_LIST_INIT(manager->socklist);
3870         result = isc_mutex_init(&manager->lock);
3871         if (result != ISC_R_SUCCESS)
3872                 goto free_manager;
3873         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3874         if (manager->fdlock == NULL) {
3875                 result = ISC_R_NOMEMORY;
3876                 goto cleanup_lock;
3877         }
3878         for (i = 0; i < FDLOCK_COUNT; i++) {
3879                 result = isc_mutex_init(&manager->fdlock[i]);
3880                 if (result != ISC_R_SUCCESS) {
3881                         while (--i >= 0)
3882                                 DESTROYLOCK(&manager->fdlock[i]);
3883                         isc_mem_put(mctx, manager->fdlock,
3884                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3885                         manager->fdlock = NULL;
3886                         goto cleanup_lock;
3887                 }
3888         }
3889
3890 #ifdef ISC_PLATFORM_USETHREADS
3891         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3892                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3893                                  "isc_condition_init() %s",
3894                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3895                                                 ISC_MSG_FAILED, "failed"));
3896                 result = ISC_R_UNEXPECTED;
3897                 goto cleanup_lock;
3898         }
3899
3900         /*
3901          * Create the special fds that will be used to wake up the
3902          * select/poll loop when something internal needs to be done.
3903          */
3904         if (pipe(manager->pipe_fds) != 0) {
3905                 isc__strerror(errno, strbuf, sizeof(strbuf));
3906                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3907                                  "pipe() %s: %s",
3908                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3909                                                 ISC_MSG_FAILED, "failed"),
3910                                  strbuf);
3911                 result = ISC_R_UNEXPECTED;
3912                 goto cleanup_condition;
3913         }
3914
3915         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3916 #if 0
3917         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3918 #endif
3919 #else /* ISC_PLATFORM_USETHREADS */
3920         manager->refs = 1;
3921 #endif /* ISC_PLATFORM_USETHREADS */
3922
3923         /*
3924          * Set up initial state for the select loop
3925          */
3926         result = setup_watcher(mctx, manager);
3927         if (result != ISC_R_SUCCESS)
3928                 goto cleanup;
3929         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3930 #ifdef ISC_PLATFORM_USETHREADS
3931         /*
3932          * Start up the select/poll thread.
3933          */
3934         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3935             ISC_R_SUCCESS) {
3936                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3937                                  "isc_thread_create() %s",
3938                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3939                                                 ISC_MSG_FAILED, "failed"));
3940                 cleanup_watcher(mctx, manager);
3941                 result = ISC_R_UNEXPECTED;
3942                 goto cleanup;
3943         }
3944 #endif /* ISC_PLATFORM_USETHREADS */
3945         isc_mem_attach(mctx, &manager->mctx);
3946
3947 #ifndef ISC_PLATFORM_USETHREADS
3948         socketmgr = manager;
3949 #endif /* ISC_PLATFORM_USETHREADS */
3950         *managerp = manager;
3951
3952         return (ISC_R_SUCCESS);
3953
3954 cleanup:
3955 #ifdef ISC_PLATFORM_USETHREADS
3956         (void)close(manager->pipe_fds[0]);
3957         (void)close(manager->pipe_fds[1]);
3958 #endif  /* ISC_PLATFORM_USETHREADS */
3959
3960 #ifdef ISC_PLATFORM_USETHREADS
3961 cleanup_condition:
3962         (void)isc_condition_destroy(&manager->shutdown_ok);
3963 #endif  /* ISC_PLATFORM_USETHREADS */
3964
3965
3966 cleanup_lock:
3967         if (manager->fdlock != NULL) {
3968                 for (i = 0; i < FDLOCK_COUNT; i++)
3969                         DESTROYLOCK(&manager->fdlock[i]);
3970         }
3971         DESTROYLOCK(&manager->lock);
3972
3973 free_manager:
3974         if (manager->fdlock != NULL) {
3975                 isc_mem_put(mctx, manager->fdlock,
3976                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3977         }
3978         if (manager->fdstate != NULL) {
3979                 isc_mem_put(mctx, manager->fdstate,
3980                             manager->maxsocks * sizeof(int));
3981         }
3982         if (manager->fds != NULL) {
3983                 isc_mem_put(mctx, manager->fds,
3984                             manager->maxsocks * sizeof(isc_socket_t *));
3985         }
3986         isc_mem_put(mctx, manager, sizeof(*manager));
3987
3988         return (result);
3989 }
3990
3991 isc_result_t
3992 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3993         REQUIRE(VALID_MANAGER(manager));
3994         REQUIRE(nsockp != NULL);
3995
3996         *nsockp = manager->maxsocks;
3997
3998         return (ISC_R_SUCCESS);
3999 }
4000
4001 void
4002 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
4003         REQUIRE(VALID_MANAGER(manager));
4004         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4005         REQUIRE(manager->stats == NULL);
4006         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4007
4008         isc_stats_attach(stats, &manager->stats);
4009 }
4010
4011 void
4012 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
4013         isc_socketmgr_t *manager;
4014         int i;
4015         isc_mem_t *mctx;
4016
4017         /*
4018          * Destroy a socket manager.
4019          */
4020
4021         REQUIRE(managerp != NULL);
4022         manager = *managerp;
4023         REQUIRE(VALID_MANAGER(manager));
4024
4025 #ifndef ISC_PLATFORM_USETHREADS
4026         if (manager->refs > 1) {
4027                 manager->refs--;
4028                 *managerp = NULL;
4029                 return;
4030         }
4031 #endif /* ISC_PLATFORM_USETHREADS */
4032
4033         LOCK(&manager->lock);
4034
4035 #ifdef ISC_PLATFORM_USETHREADS
4036         /*
4037          * Wait for all sockets to be destroyed.
4038          */
4039         while (!ISC_LIST_EMPTY(manager->socklist)) {
4040                 manager_log(manager, CREATION, "%s",
4041                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4042                                            ISC_MSG_SOCKETSREMAIN,
4043                                            "sockets exist"));
4044                 WAIT(&manager->shutdown_ok, &manager->lock);
4045         }
4046 #else /* ISC_PLATFORM_USETHREADS */
4047         /*
4048          * Hope all sockets have been destroyed.
4049          */
4050         if (!ISC_LIST_EMPTY(manager->socklist)) {
4051                 manager_log(manager, CREATION, "%s",
4052                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4053                                            ISC_MSG_SOCKETSREMAIN,
4054                                            "sockets exist"));
4055                 INSIST(0);
4056         }
4057 #endif /* ISC_PLATFORM_USETHREADS */
4058
4059         UNLOCK(&manager->lock);
4060
4061         /*
4062          * Here, poke our select/poll thread.  Do this by closing the write
4063          * half of the pipe, which will send EOF to the read half.
4064          * This is currently a no-op in the non-threaded case.
4065          */
4066         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4067
4068 #ifdef ISC_PLATFORM_USETHREADS
4069         /*
4070          * Wait for thread to exit.
4071          */
4072         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4073                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4074                                  "isc_thread_join() %s",
4075                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4076                                                 ISC_MSG_FAILED, "failed"));
4077 #endif /* ISC_PLATFORM_USETHREADS */
4078
4079         /*
4080          * Clean up.
4081          */
4082         cleanup_watcher(manager->mctx, manager);
4083
4084 #ifdef ISC_PLATFORM_USETHREADS
4085         (void)close(manager->pipe_fds[0]);
4086         (void)close(manager->pipe_fds[1]);
4087         (void)isc_condition_destroy(&manager->shutdown_ok);
4088 #endif /* ISC_PLATFORM_USETHREADS */
4089
4090         for (i = 0; i < (int)manager->maxsocks; i++)
4091                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4092                         (void)close(i);
4093
4094         isc_mem_put(manager->mctx, manager->fds,
4095                     manager->maxsocks * sizeof(isc_socket_t *));
4096         isc_mem_put(manager->mctx, manager->fdstate,
4097                     manager->maxsocks * sizeof(int));
4098
4099         if (manager->stats != NULL)
4100                 isc_stats_detach(&manager->stats);
4101
4102         if (manager->fdlock != NULL) {
4103                 for (i = 0; i < FDLOCK_COUNT; i++)
4104                         DESTROYLOCK(&manager->fdlock[i]);
4105                 isc_mem_put(manager->mctx, manager->fdlock,
4106                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4107         }
4108         DESTROYLOCK(&manager->lock);
4109         manager->magic = 0;
4110         mctx= manager->mctx;
4111         isc_mem_put(mctx, manager, sizeof(*manager));
4112
4113         isc_mem_detach(&mctx);
4114
4115         *managerp = NULL;
4116 }
4117
4118 static isc_result_t
4119 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4120             unsigned int flags)
4121 {
4122         int io_state;
4123         isc_boolean_t have_lock = ISC_FALSE;
4124         isc_task_t *ntask = NULL;
4125         isc_result_t result = ISC_R_SUCCESS;
4126
4127         dev->ev_sender = task;
4128
4129         if (sock->type == isc_sockettype_udp) {
4130                 io_state = doio_recv(sock, dev);
4131         } else {
4132                 LOCK(&sock->lock);
4133                 have_lock = ISC_TRUE;
4134
4135                 if (ISC_LIST_EMPTY(sock->recv_list))
4136                         io_state = doio_recv(sock, dev);
4137                 else
4138                         io_state = DOIO_SOFT;
4139         }
4140
4141         switch (io_state) {
4142         case DOIO_SOFT:
4143                 /*
4144                  * We couldn't read all or part of the request right now, so
4145                  * queue it.
4146                  *
4147                  * Attach to socket and to task
4148                  */
4149                 isc_task_attach(task, &ntask);
4150                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4151
4152                 if (!have_lock) {
4153                         LOCK(&sock->lock);
4154                         have_lock = ISC_TRUE;
4155                 }
4156
4157                 /*
4158                  * Enqueue the request.  If the socket was previously not being
4159                  * watched, poke the watcher to start paying attention to it.
4160                  */
4161                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4162                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4163                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4164
4165                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4166                            "socket_recv: event %p -> task %p",
4167                            dev, ntask);
4168
4169                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4170                         result = ISC_R_INPROGRESS;
4171                 break;
4172
4173         case DOIO_EOF:
4174                 dev->result = ISC_R_EOF;
4175                 /* fallthrough */
4176
4177         case DOIO_HARD:
4178         case DOIO_SUCCESS:
4179                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4180                         send_recvdone_event(sock, &dev);
4181                 break;
4182         }
4183
4184         if (have_lock)
4185                 UNLOCK(&sock->lock);
4186
4187         return (result);
4188 }
4189
4190 isc_result_t
4191 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4192                  unsigned int minimum, isc_task_t *task,
4193                  isc_taskaction_t action, const void *arg)
4194 {
4195         isc_socketevent_t *dev;
4196         isc_socketmgr_t *manager;
4197         unsigned int iocount;
4198         isc_buffer_t *buffer;
4199
4200         REQUIRE(VALID_SOCKET(sock));
4201         REQUIRE(buflist != NULL);
4202         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4203         REQUIRE(task != NULL);
4204         REQUIRE(action != NULL);
4205
4206         manager = sock->manager;
4207         REQUIRE(VALID_MANAGER(manager));
4208
4209         iocount = isc_bufferlist_availablecount(buflist);
4210         REQUIRE(iocount > 0);
4211
4212         INSIST(sock->bound);
4213
4214         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4215         if (dev == NULL) {
4216                 return (ISC_R_NOMEMORY);
4217         }
4218
4219         /*
4220          * UDP sockets are always partial read
4221          */
4222         if (sock->type == isc_sockettype_udp)
4223                 dev->minimum = 1;
4224         else {
4225                 if (minimum == 0)
4226                         dev->minimum = iocount;
4227                 else
4228                         dev->minimum = minimum;
4229         }
4230
4231         /*
4232          * Move each buffer from the passed in list to our internal one.
4233          */
4234         buffer = ISC_LIST_HEAD(*buflist);
4235         while (buffer != NULL) {
4236                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4237                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4238                 buffer = ISC_LIST_HEAD(*buflist);
4239         }
4240
4241         return (socket_recv(sock, dev, task, 0));
4242 }
4243
4244 isc_result_t
4245 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4246                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4247 {
4248         isc_socketevent_t *dev;
4249         isc_socketmgr_t *manager;
4250
4251         REQUIRE(VALID_SOCKET(sock));
4252         REQUIRE(action != NULL);
4253
4254         manager = sock->manager;
4255         REQUIRE(VALID_MANAGER(manager));
4256
4257         INSIST(sock->bound);
4258
4259         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4260         if (dev == NULL)
4261                 return (ISC_R_NOMEMORY);
4262
4263         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4264 }
4265
4266 isc_result_t
4267 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
4268                  unsigned int minimum, isc_task_t *task,
4269                  isc_socketevent_t *event, unsigned int flags)
4270 {
4271         event->ev_sender = sock;
4272         event->result = ISC_R_UNEXPECTED;
4273         ISC_LIST_INIT(event->bufferlist);
4274         event->region = *region;
4275         event->n = 0;
4276         event->offset = 0;
4277         event->attributes = 0;
4278
4279         /*
4280          * UDP sockets are always partial read.
4281          */
4282         if (sock->type == isc_sockettype_udp)
4283                 event->minimum = 1;
4284         else {
4285                 if (minimum == 0)
4286                         event->minimum = region->length;
4287                 else
4288                         event->minimum = minimum;
4289         }
4290
4291         return (socket_recv(sock, event, task, flags));
4292 }
4293
4294 static isc_result_t
4295 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4296             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4297             unsigned int flags)
4298 {
4299         int io_state;
4300         isc_boolean_t have_lock = ISC_FALSE;
4301         isc_task_t *ntask = NULL;
4302         isc_result_t result = ISC_R_SUCCESS;
4303
4304         dev->ev_sender = task;
4305
4306         set_dev_address(address, sock, dev);
4307         if (pktinfo != NULL) {
4308                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4309                 dev->pktinfo = *pktinfo;
4310
4311                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4312                     !isc_sockaddr_islinklocal(&dev->address)) {
4313                         socket_log(sock, NULL, TRACE, isc_msgcat,
4314                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4315                                    "pktinfo structure provided, ifindex %u "
4316                                    "(set to 0)", pktinfo->ipi6_ifindex);
4317
4318                         /*
4319                          * Set the pktinfo index to 0 here, to let the
4320                          * kernel decide what interface it should send on.
4321                          */
4322                         dev->pktinfo.ipi6_ifindex = 0;
4323                 }
4324         }
4325
4326         if (sock->type == isc_sockettype_udp)
4327                 io_state = doio_send(sock, dev);
4328         else {
4329                 LOCK(&sock->lock);
4330                 have_lock = ISC_TRUE;
4331
4332                 if (ISC_LIST_EMPTY(sock->send_list))
4333                         io_state = doio_send(sock, dev);
4334                 else
4335                         io_state = DOIO_SOFT;
4336         }
4337
4338         switch (io_state) {
4339         case DOIO_SOFT:
4340                 /*
4341                  * We couldn't send all or part of the request right now, so
4342                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4343                  */
4344                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4345                         isc_task_attach(task, &ntask);
4346                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4347
4348                         if (!have_lock) {
4349                                 LOCK(&sock->lock);
4350                                 have_lock = ISC_TRUE;
4351                         }
4352
4353                         /*
4354                          * Enqueue the request.  If the socket was previously
4355                          * not being watched, poke the watcher to start
4356                          * paying attention to it.
4357                          */
4358                         if (ISC_LIST_EMPTY(sock->send_list) &&
4359                             !sock->pending_send)
4360                                 select_poke(sock->manager, sock->fd,
4361                                             SELECT_POKE_WRITE);
4362                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4363
4364                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4365                                    "socket_send: event %p -> task %p",
4366                                    dev, ntask);
4367
4368                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4369                                 result = ISC_R_INPROGRESS;
4370                         break;
4371                 }
4372
4373         case DOIO_HARD:
4374         case DOIO_SUCCESS:
4375                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4376                         send_senddone_event(sock, &dev);
4377                 break;
4378         }
4379
4380         if (have_lock)
4381                 UNLOCK(&sock->lock);
4382
4383         return (result);
4384 }
4385
4386 isc_result_t
4387 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
4388                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4389 {
4390         /*
4391          * REQUIRE() checking is performed in isc_socket_sendto().
4392          */
4393         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
4394                                   NULL));
4395 }
4396
4397 isc_result_t
4398 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
4399                   isc_task_t *task, isc_taskaction_t action, const void *arg,
4400                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4401 {
4402         isc_socketevent_t *dev;
4403         isc_socketmgr_t *manager;
4404
4405         REQUIRE(VALID_SOCKET(sock));
4406         REQUIRE(region != NULL);
4407         REQUIRE(task != NULL);
4408         REQUIRE(action != NULL);
4409
4410         manager = sock->manager;
4411         REQUIRE(VALID_MANAGER(manager));
4412
4413         INSIST(sock->bound);
4414
4415         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4416         if (dev == NULL) {
4417                 return (ISC_R_NOMEMORY);
4418         }
4419
4420         dev->region = *region;
4421
4422         return (socket_send(sock, dev, task, address, pktinfo, 0));
4423 }
4424
4425 isc_result_t
4426 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4427                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4428 {
4429         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4430                                    NULL));
4431 }
4432
4433 isc_result_t
4434 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4435                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4436                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4437 {
4438         isc_socketevent_t *dev;
4439         isc_socketmgr_t *manager;
4440         unsigned int iocount;
4441         isc_buffer_t *buffer;
4442
4443         REQUIRE(VALID_SOCKET(sock));
4444         REQUIRE(buflist != NULL);
4445         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4446         REQUIRE(task != NULL);
4447         REQUIRE(action != NULL);
4448
4449         manager = sock->manager;
4450         REQUIRE(VALID_MANAGER(manager));
4451
4452         iocount = isc_bufferlist_usedcount(buflist);
4453         REQUIRE(iocount > 0);
4454
4455         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4456         if (dev == NULL) {
4457                 return (ISC_R_NOMEMORY);
4458         }
4459
4460         /*
4461          * Move each buffer from the passed in list to our internal one.
4462          */
4463         buffer = ISC_LIST_HEAD(*buflist);
4464         while (buffer != NULL) {
4465                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4466                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4467                 buffer = ISC_LIST_HEAD(*buflist);
4468         }
4469
4470         return (socket_send(sock, dev, task, address, pktinfo, 0));
4471 }
4472
4473 isc_result_t
4474 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4475                    isc_task_t *task,
4476                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4477                    isc_socketevent_t *event, unsigned int flags)
4478 {
4479         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4480         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4481                 REQUIRE(sock->type == isc_sockettype_udp);
4482         event->ev_sender = sock;
4483         event->result = ISC_R_UNEXPECTED;
4484         ISC_LIST_INIT(event->bufferlist);
4485         event->region = *region;
4486         event->n = 0;
4487         event->offset = 0;
4488         event->attributes = 0;
4489
4490         return (socket_send(sock, event, task, address, pktinfo, flags));
4491 }
4492
4493 void
4494 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4495 #ifdef ISC_PLATFORM_HAVESYSUNH
4496         int s;
4497         struct stat sb;
4498         char strbuf[ISC_STRERRORSIZE];
4499
4500         if (sockaddr->type.sa.sa_family != AF_UNIX)
4501                 return;
4502
4503 #ifndef S_ISSOCK
4504 #if defined(S_IFMT) && defined(S_IFSOCK)
4505 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4506 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4507 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4508 #endif
4509 #endif
4510
4511 #ifndef S_ISFIFO
4512 #if defined(S_IFMT) && defined(S_IFIFO)
4513 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4514 #elif defined(_S_IFMT) && defined(S_IFIFO)
4515 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4516 #endif
4517 #endif
4518
4519 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4520 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4521 #endif
4522
4523 #ifndef S_ISFIFO
4524 #define S_ISFIFO(mode) 0
4525 #endif
4526
4527 #ifndef S_ISSOCK
4528 #define S_ISSOCK(mode) 0
4529 #endif
4530
4531         if (active) {
4532                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4533                         isc__strerror(errno, strbuf, sizeof(strbuf));
4534                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4535                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4536                                       "isc_socket_cleanunix: stat(%s): %s",
4537                                       sockaddr->type.sunix.sun_path, strbuf);
4538                         return;
4539                 }
4540                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4541                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4542                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4543                                       "isc_socket_cleanunix: %s: not a socket",
4544                                       sockaddr->type.sunix.sun_path);
4545                         return;
4546                 }
4547                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4548                         isc__strerror(errno, strbuf, sizeof(strbuf));
4549                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4550                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4551                                       "isc_socket_cleanunix: unlink(%s): %s",
4552                                       sockaddr->type.sunix.sun_path, strbuf);
4553                 }
4554                 return;
4555         }
4556
4557         s = socket(AF_UNIX, SOCK_STREAM, 0);
4558         if (s < 0) {
4559                 isc__strerror(errno, strbuf, sizeof(strbuf));
4560                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4561                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4562                               "isc_socket_cleanunix: socket(%s): %s",
4563                               sockaddr->type.sunix.sun_path, strbuf);
4564                 return;
4565         }
4566
4567         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4568                 switch (errno) {
4569                 case ENOENT:    /* We exited cleanly last time */
4570                         break;
4571                 default:
4572                         isc__strerror(errno, strbuf, sizeof(strbuf));
4573                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4574                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4575                                       "isc_socket_cleanunix: stat(%s): %s",
4576                                       sockaddr->type.sunix.sun_path, strbuf);
4577                         break;
4578                 }
4579                 goto cleanup;
4580         }
4581
4582         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4583                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4584                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4585                               "isc_socket_cleanunix: %s: not a socket",
4586                               sockaddr->type.sunix.sun_path);
4587                 goto cleanup;
4588         }
4589
4590         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4591                     sizeof(sockaddr->type.sunix)) < 0) {
4592                 switch (errno) {
4593                 case ECONNREFUSED:
4594                 case ECONNRESET:
4595                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4596                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4597                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4598                                               ISC_LOGMODULE_SOCKET,
4599                                               ISC_LOG_WARNING,
4600                                               "isc_socket_cleanunix: "
4601                                               "unlink(%s): %s",
4602                                               sockaddr->type.sunix.sun_path,
4603                                               strbuf);
4604                         }
4605                         break;
4606                 default:
4607                         isc__strerror(errno, strbuf, sizeof(strbuf));
4608                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4609                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4610                                       "isc_socket_cleanunix: connect(%s): %s",
4611                                       sockaddr->type.sunix.sun_path, strbuf);
4612                         break;
4613                 }
4614         }
4615  cleanup:
4616         close(s);
4617 #else
4618         UNUSED(sockaddr);
4619         UNUSED(active);
4620 #endif
4621 }
4622
4623 isc_result_t
4624 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4625                     isc_uint32_t owner, isc_uint32_t group)
4626 {
4627 #ifdef ISC_PLATFORM_HAVESYSUNH
4628         isc_result_t result = ISC_R_SUCCESS;
4629         char strbuf[ISC_STRERRORSIZE];
4630         char path[sizeof(sockaddr->type.sunix.sun_path)];
4631 #ifdef NEED_SECURE_DIRECTORY
4632         char *slash;
4633 #endif
4634
4635         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4636         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4637         strcpy(path, sockaddr->type.sunix.sun_path);
4638
4639 #ifdef NEED_SECURE_DIRECTORY
4640         slash = strrchr(path, '/');
4641         if (slash != NULL) {
4642                 if (slash != path)
4643                         *slash = '\0';
4644                 else
4645                         strcpy(path, "/");
4646         } else
4647                 strcpy(path, ".");
4648 #endif
4649
4650         if (chmod(path, perm) < 0) {
4651                 isc__strerror(errno, strbuf, sizeof(strbuf));
4652                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4653                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4654                               "isc_socket_permunix: chmod(%s, %d): %s",
4655                               path, perm, strbuf);
4656                 result = ISC_R_FAILURE;
4657         }
4658         if (chown(path, owner, group) < 0) {
4659                 isc__strerror(errno, strbuf, sizeof(strbuf));
4660                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4661                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4662                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4663                               path, owner, group,
4664                               strbuf);
4665                 result = ISC_R_FAILURE;
4666         }
4667         return (result);
4668 #else
4669         UNUSED(sockaddr);
4670         UNUSED(perm);
4671         UNUSED(owner);
4672         UNUSED(group);
4673         return (ISC_R_NOTIMPLEMENTED);
4674 #endif
4675 }
4676
4677 isc_result_t
4678 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4679                 unsigned int options) {
4680         char strbuf[ISC_STRERRORSIZE];
4681         int on = 1;
4682
4683         LOCK(&sock->lock);
4684
4685         INSIST(!sock->bound);
4686
4687         if (sock->pf != sockaddr->type.sa.sa_family) {
4688                 UNLOCK(&sock->lock);
4689                 return (ISC_R_FAMILYMISMATCH);
4690         }
4691         /*
4692          * Only set SO_REUSEADDR when we want a specific port.
4693          */
4694 #ifdef AF_UNIX
4695         if (sock->pf == AF_UNIX)
4696                 goto bind_socket;
4697 #endif
4698         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4699             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4700             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4701                        sizeof(on)) < 0) {
4702                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4703                                  "setsockopt(%d) %s", sock->fd,
4704                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4705                                                 ISC_MSG_FAILED, "failed"));
4706                 /* Press on... */
4707         }
4708 #ifdef AF_UNIX
4709  bind_socket:
4710 #endif
4711         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4712                 inc_stats(sock->manager->stats,
4713                           sock->statsindex[STATID_BINDFAIL]);
4714
4715                 UNLOCK(&sock->lock);
4716                 switch (errno) {
4717                 case EACCES:
4718                         return (ISC_R_NOPERM);
4719                 case EADDRNOTAVAIL:
4720                         return (ISC_R_ADDRNOTAVAIL);
4721                 case EADDRINUSE:
4722                         return (ISC_R_ADDRINUSE);
4723                 case EINVAL:
4724                         return (ISC_R_BOUND);
4725                 default:
4726                         isc__strerror(errno, strbuf, sizeof(strbuf));
4727                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4728                                          strbuf);
4729                         return (ISC_R_UNEXPECTED);
4730                 }
4731         }
4732
4733         socket_log(sock, sockaddr, TRACE,
4734                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4735         sock->bound = 1;
4736
4737         UNLOCK(&sock->lock);
4738         return (ISC_R_SUCCESS);
4739 }
4740
4741 /*
4742  * Enable this only for specific OS versions, and only when they have repaired
4743  * their problems with it.  Until then, this is broken and needs to be
4744  * disabled by default.  See RT22589 for details.
4745  */
4746 #undef ENABLE_ACCEPTFILTER
4747
4748 isc_result_t
4749 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4750 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4751         char strbuf[ISC_STRERRORSIZE];
4752         struct accept_filter_arg afa;
4753 #else
4754         UNUSED(sock);
4755         UNUSED(filter);
4756 #endif
4757
4758         REQUIRE(VALID_SOCKET(sock));
4759
4760 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4761         bzero(&afa, sizeof(afa));
4762         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4763         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4764                          &afa, sizeof(afa)) == -1) {
4765                 isc__strerror(errno, strbuf, sizeof(strbuf));
4766                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4767                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4768                            strbuf);
4769                 return (ISC_R_FAILURE);
4770         }
4771         return (ISC_R_SUCCESS);
4772 #else
4773         return (ISC_R_NOTIMPLEMENTED);
4774 #endif
4775 }
4776
4777 /*
4778  * Set up to listen on a given socket.  We do this by creating an internal
4779  * event that will be dispatched when the socket has read activity.  The
4780  * watcher will send the internal event to the task when there is a new
4781  * connection.
4782  *
4783  * Unlike in read, we don't preallocate a done event here.  Every time there
4784  * is a new connection we'll have to allocate a new one anyway, so we might
4785  * as well keep things simple rather than having to track them.
4786  */
4787 isc_result_t
4788 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4789         char strbuf[ISC_STRERRORSIZE];
4790
4791         REQUIRE(VALID_SOCKET(sock));
4792
4793         LOCK(&sock->lock);
4794
4795         REQUIRE(!sock->listener);
4796         REQUIRE(sock->bound);
4797         REQUIRE(sock->type == isc_sockettype_tcp ||
4798                 sock->type == isc_sockettype_unix);
4799
4800         if (backlog == 0)
4801                 backlog = SOMAXCONN;
4802
4803         if (listen(sock->fd, (int)backlog) < 0) {
4804                 UNLOCK(&sock->lock);
4805                 isc__strerror(errno, strbuf, sizeof(strbuf));
4806
4807                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4808
4809                 return (ISC_R_UNEXPECTED);
4810         }
4811
4812         sock->listener = 1;
4813
4814         UNLOCK(&sock->lock);
4815         return (ISC_R_SUCCESS);
4816 }
4817
4818 /*
4819  * This should try to do aggressive accept() XXXMLG
4820  */
4821 isc_result_t
4822 isc_socket_accept(isc_socket_t *sock,
4823                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4824 {
4825         isc_socket_newconnev_t *dev;
4826         isc_socketmgr_t *manager;
4827         isc_task_t *ntask = NULL;
4828         isc_socket_t *nsock;
4829         isc_result_t result;
4830         isc_boolean_t do_poke = ISC_FALSE;
4831
4832         REQUIRE(VALID_SOCKET(sock));
4833         manager = sock->manager;
4834         REQUIRE(VALID_MANAGER(manager));
4835
4836         LOCK(&sock->lock);
4837
4838         REQUIRE(sock->listener);
4839
4840         /*
4841          * Sender field is overloaded here with the task we will be sending
4842          * this event to.  Just before the actual event is delivered the
4843          * actual ev_sender will be touched up to be the socket.
4844          */
4845         dev = (isc_socket_newconnev_t *)
4846                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4847                                    action, arg, sizeof(*dev));
4848         if (dev == NULL) {
4849                 UNLOCK(&sock->lock);
4850                 return (ISC_R_NOMEMORY);
4851         }
4852         ISC_LINK_INIT(dev, ev_link);
4853
4854         result = allocate_socket(manager, sock->type, &nsock);
4855         if (result != ISC_R_SUCCESS) {
4856                 isc_event_free(ISC_EVENT_PTR(&dev));
4857                 UNLOCK(&sock->lock);
4858                 return (result);
4859         }
4860
4861         /*
4862          * Attach to socket and to task.
4863          */
4864         isc_task_attach(task, &ntask);
4865         if (isc_task_exiting(ntask)) {
4866                 isc_task_detach(&ntask);
4867                 isc_event_free(ISC_EVENT_PTR(&dev));
4868                 UNLOCK(&sock->lock);
4869                 return (ISC_R_SHUTTINGDOWN);
4870         }
4871         nsock->references++;
4872         nsock->statsindex = sock->statsindex;
4873
4874         dev->ev_sender = ntask;
4875         dev->newsocket = nsock;
4876
4877         /*
4878          * Poke watcher here.  We still have the socket locked, so there
4879          * is no race condition.  We will keep the lock for such a short
4880          * bit of time waking it up now or later won't matter all that much.
4881          */
4882         if (ISC_LIST_EMPTY(sock->accept_list))
4883                 do_poke = ISC_TRUE;
4884
4885         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4886
4887         if (do_poke)
4888                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4889
4890         UNLOCK(&sock->lock);
4891         return (ISC_R_SUCCESS);
4892 }
4893
4894 isc_result_t
4895 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4896                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4897 {
4898         isc_socket_connev_t *dev;
4899         isc_task_t *ntask = NULL;
4900         isc_socketmgr_t *manager;
4901         int cc;
4902         char strbuf[ISC_STRERRORSIZE];
4903         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4904
4905         REQUIRE(VALID_SOCKET(sock));
4906         REQUIRE(addr != NULL);
4907         REQUIRE(task != NULL);
4908         REQUIRE(action != NULL);
4909
4910         manager = sock->manager;
4911         REQUIRE(VALID_MANAGER(manager));
4912         REQUIRE(addr != NULL);
4913
4914         if (isc_sockaddr_ismulticast(addr))
4915                 return (ISC_R_MULTICAST);
4916
4917         LOCK(&sock->lock);
4918
4919         REQUIRE(!sock->connecting);
4920
4921         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4922                                                         ISC_SOCKEVENT_CONNECT,
4923                                                         action, arg,
4924                                                         sizeof(*dev));
4925         if (dev == NULL) {
4926                 UNLOCK(&sock->lock);
4927                 return (ISC_R_NOMEMORY);
4928         }
4929         ISC_LINK_INIT(dev, ev_link);
4930
4931         /*
4932          * Try to do the connect right away, as there can be only one
4933          * outstanding, and it might happen to complete.
4934          */
4935         sock->peer_address = *addr;
4936         cc = connect(sock->fd, &addr->type.sa, addr->length);
4937         if (cc < 0) {
4938                 /*
4939                  * HP-UX "fails" to connect a UDP socket and sets errno to
4940                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4941                  * a success and let the user detect it if it's really an error
4942                  * at the time of sending a packet on the socket.
4943                  */
4944                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4945                         cc = 0;
4946                         goto success;
4947                 }
4948                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4949                         goto queue;
4950
4951                 switch (errno) {
4952 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4953                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4954                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4955                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4956                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4957                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4958 #ifdef EHOSTDOWN
4959                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4960 #endif
4961                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4962                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4963                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4964                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4965                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4966 #undef ERROR_MATCH
4967                 }
4968
4969                 sock->connected = 0;
4970
4971                 isc__strerror(errno, strbuf, sizeof(strbuf));
4972                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4973                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4974                                  addrbuf, errno, strbuf);
4975
4976                 UNLOCK(&sock->lock);
4977                 inc_stats(sock->manager->stats,
4978                           sock->statsindex[STATID_CONNECTFAIL]);
4979                 isc_event_free(ISC_EVENT_PTR(&dev));
4980                 return (ISC_R_UNEXPECTED);
4981
4982         err_exit:
4983                 sock->connected = 0;
4984                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4985
4986                 UNLOCK(&sock->lock);
4987                 inc_stats(sock->manager->stats,
4988                           sock->statsindex[STATID_CONNECTFAIL]);
4989                 return (ISC_R_SUCCESS);
4990         }
4991
4992         /*
4993          * If connect completed, fire off the done event.
4994          */
4995  success:
4996         if (cc == 0) {
4997                 sock->connected = 1;
4998                 sock->bound = 1;
4999                 dev->result = ISC_R_SUCCESS;
5000                 isc_task_send(task, ISC_EVENT_PTR(&dev));
5001
5002                 UNLOCK(&sock->lock);
5003
5004                 inc_stats(sock->manager->stats,
5005                           sock->statsindex[STATID_CONNECT]);
5006
5007                 return (ISC_R_SUCCESS);
5008         }
5009
5010  queue:
5011
5012         /*
5013          * Attach to task.
5014          */
5015         isc_task_attach(task, &ntask);
5016
5017         sock->connecting = 1;
5018
5019         dev->ev_sender = ntask;
5020
5021         /*
5022          * Poke watcher here.  We still have the socket locked, so there
5023          * is no race condition.  We will keep the lock for such a short
5024          * bit of time waking it up now or later won't matter all that much.
5025          */
5026         if (sock->connect_ev == NULL)
5027                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
5028
5029         sock->connect_ev = dev;
5030
5031         UNLOCK(&sock->lock);
5032         return (ISC_R_SUCCESS);
5033 }
5034
5035 /*
5036  * Called when a socket with a pending connect() finishes.
5037  */
5038 static void
5039 internal_connect(isc_task_t *me, isc_event_t *ev) {
5040         isc_socket_t *sock;
5041         isc_socket_connev_t *dev;
5042         isc_task_t *task;
5043         int cc;
5044         ISC_SOCKADDR_LEN_T optlen;
5045         char strbuf[ISC_STRERRORSIZE];
5046         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5047
5048         UNUSED(me);
5049         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5050
5051         sock = ev->ev_sender;
5052         INSIST(VALID_SOCKET(sock));
5053
5054         LOCK(&sock->lock);
5055
5056         /*
5057          * When the internal event was sent the reference count was bumped
5058          * to keep the socket around for us.  Decrement the count here.
5059          */
5060         INSIST(sock->references > 0);
5061         sock->references--;
5062         if (sock->references == 0) {
5063                 UNLOCK(&sock->lock);
5064                 destroy(&sock);
5065                 return;
5066         }
5067
5068         /*
5069          * Has this event been canceled?
5070          */
5071         dev = sock->connect_ev;
5072         if (dev == NULL) {
5073                 INSIST(!sock->connecting);
5074                 UNLOCK(&sock->lock);
5075                 return;
5076         }
5077
5078         INSIST(sock->connecting);
5079         sock->connecting = 0;
5080
5081         /*
5082          * Get any possible error status here.
5083          */
5084         optlen = sizeof(cc);
5085         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5086                        (void *)&cc, (void *)&optlen) < 0)
5087                 cc = errno;
5088         else
5089                 errno = cc;
5090
5091         if (errno != 0) {
5092                 /*
5093                  * If the error is EAGAIN, just re-select on this
5094                  * fd and pretend nothing strange happened.
5095                  */
5096                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5097                         sock->connecting = 1;
5098                         select_poke(sock->manager, sock->fd,
5099                                     SELECT_POKE_CONNECT);
5100                         UNLOCK(&sock->lock);
5101
5102                         return;
5103                 }
5104
5105                 inc_stats(sock->manager->stats,
5106                           sock->statsindex[STATID_CONNECTFAIL]);
5107
5108                 /*
5109                  * Translate other errors into ISC_R_* flavors.
5110                  */
5111                 switch (errno) {
5112 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5113                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5114                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5115                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5116                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5117                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5118 #ifdef EHOSTDOWN
5119                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5120 #endif
5121                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5122                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5123                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5124                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5125                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5126                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5127 #undef ERROR_MATCH
5128                 default:
5129                         dev->result = ISC_R_UNEXPECTED;
5130                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5131                                             sizeof(peerbuf));
5132                         isc__strerror(errno, strbuf, sizeof(strbuf));
5133                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5134                                          "internal_connect: connect(%s) %s",
5135                                          peerbuf, strbuf);
5136                 }
5137         } else {
5138                 inc_stats(sock->manager->stats,
5139                           sock->statsindex[STATID_CONNECT]);
5140                 dev->result = ISC_R_SUCCESS;
5141                 sock->connected = 1;
5142                 sock->bound = 1;
5143         }
5144
5145         sock->connect_ev = NULL;
5146
5147         UNLOCK(&sock->lock);
5148
5149         task = dev->ev_sender;
5150         dev->ev_sender = sock;
5151         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5152 }
5153
5154 isc_result_t
5155 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5156         isc_result_t result;
5157
5158         REQUIRE(VALID_SOCKET(sock));
5159         REQUIRE(addressp != NULL);
5160
5161         LOCK(&sock->lock);
5162
5163         if (sock->connected) {
5164                 *addressp = sock->peer_address;
5165                 result = ISC_R_SUCCESS;
5166         } else {
5167                 result = ISC_R_NOTCONNECTED;
5168         }
5169
5170         UNLOCK(&sock->lock);
5171
5172         return (result);
5173 }
5174
5175 isc_result_t
5176 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5177         ISC_SOCKADDR_LEN_T len;
5178         isc_result_t result;
5179         char strbuf[ISC_STRERRORSIZE];
5180
5181         REQUIRE(VALID_SOCKET(sock));
5182         REQUIRE(addressp != NULL);
5183
5184         LOCK(&sock->lock);
5185
5186         if (!sock->bound) {
5187                 result = ISC_R_NOTBOUND;
5188                 goto out;
5189         }
5190
5191         result = ISC_R_SUCCESS;
5192
5193         len = sizeof(addressp->type);
5194         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5195                 isc__strerror(errno, strbuf, sizeof(strbuf));
5196                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5197                                  strbuf);
5198                 result = ISC_R_UNEXPECTED;
5199                 goto out;
5200         }
5201         addressp->length = (unsigned int)len;
5202
5203  out:
5204         UNLOCK(&sock->lock);
5205
5206         return (result);
5207 }
5208
5209 /*
5210  * Run through the list of events on this socket, and cancel the ones
5211  * queued for task "task" of type "how".  "how" is a bitmask.
5212  */
5213 void
5214 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5215
5216         REQUIRE(VALID_SOCKET(sock));
5217
5218         /*
5219          * Quick exit if there is nothing to do.  Don't even bother locking
5220          * in this case.
5221          */
5222         if (how == 0)
5223                 return;
5224
5225         LOCK(&sock->lock);
5226
5227         /*
5228          * All of these do the same thing, more or less.
5229          * Each will:
5230          *      o If the internal event is marked as "posted" try to
5231          *        remove it from the task's queue.  If this fails, mark it
5232          *        as canceled instead, and let the task clean it up later.
5233          *      o For each I/O request for that task of that type, post
5234          *        its done event with status of "ISC_R_CANCELED".
5235          *      o Reset any state needed.
5236          */
5237         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5238             && !ISC_LIST_EMPTY(sock->recv_list)) {
5239                 isc_socketevent_t      *dev;
5240                 isc_socketevent_t      *next;
5241                 isc_task_t             *current_task;
5242
5243                 dev = ISC_LIST_HEAD(sock->recv_list);
5244
5245                 while (dev != NULL) {
5246                         current_task = dev->ev_sender;
5247                         next = ISC_LIST_NEXT(dev, ev_link);
5248
5249                         if ((task == NULL) || (task == current_task)) {
5250                                 dev->result = ISC_R_CANCELED;
5251                                 send_recvdone_event(sock, &dev);
5252                         }
5253                         dev = next;
5254                 }
5255         }
5256
5257         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5258             && !ISC_LIST_EMPTY(sock->send_list)) {
5259                 isc_socketevent_t      *dev;
5260                 isc_socketevent_t      *next;
5261                 isc_task_t             *current_task;
5262
5263                 dev = ISC_LIST_HEAD(sock->send_list);
5264
5265                 while (dev != NULL) {
5266                         current_task = dev->ev_sender;
5267                         next = ISC_LIST_NEXT(dev, ev_link);
5268
5269                         if ((task == NULL) || (task == current_task)) {
5270                                 dev->result = ISC_R_CANCELED;
5271                                 send_senddone_event(sock, &dev);
5272                         }
5273                         dev = next;
5274                 }
5275         }
5276
5277         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5278             && !ISC_LIST_EMPTY(sock->accept_list)) {
5279                 isc_socket_newconnev_t *dev;
5280                 isc_socket_newconnev_t *next;
5281                 isc_task_t             *current_task;
5282
5283                 dev = ISC_LIST_HEAD(sock->accept_list);
5284                 while (dev != NULL) {
5285                         current_task = dev->ev_sender;
5286                         next = ISC_LIST_NEXT(dev, ev_link);
5287
5288                         if ((task == NULL) || (task == current_task)) {
5289
5290                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5291                                                 ev_link);
5292
5293                                 dev->newsocket->references--;
5294                                 free_socket(&dev->newsocket);
5295
5296                                 dev->result = ISC_R_CANCELED;
5297                                 dev->ev_sender = sock;
5298                                 isc_task_sendanddetach(&current_task,
5299                                                        ISC_EVENT_PTR(&dev));
5300                         }
5301
5302                         dev = next;
5303                 }
5304         }
5305
5306         /*
5307          * Connecting is not a list.
5308          */
5309         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5310             && sock->connect_ev != NULL) {
5311                 isc_socket_connev_t    *dev;
5312                 isc_task_t             *current_task;
5313
5314                 INSIST(sock->connecting);
5315                 sock->connecting = 0;
5316
5317                 dev = sock->connect_ev;
5318                 current_task = dev->ev_sender;
5319
5320                 if ((task == NULL) || (task == current_task)) {
5321                         sock->connect_ev = NULL;
5322
5323                         dev->result = ISC_R_CANCELED;
5324                         dev->ev_sender = sock;
5325                         isc_task_sendanddetach(&current_task,
5326                                                ISC_EVENT_PTR(&dev));
5327                 }
5328         }
5329
5330         UNLOCK(&sock->lock);
5331 }
5332
5333 isc_sockettype_t
5334 isc_socket_gettype(isc_socket_t *sock) {
5335         REQUIRE(VALID_SOCKET(sock));
5336
5337         return (sock->type);
5338 }
5339
5340 isc_boolean_t
5341 isc_socket_isbound(isc_socket_t *sock) {
5342         isc_boolean_t val;
5343
5344         LOCK(&sock->lock);
5345         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5346         UNLOCK(&sock->lock);
5347
5348         return (val);
5349 }
5350
5351 void
5352 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
5353 #if defined(IPV6_V6ONLY)
5354         int onoff = yes ? 1 : 0;
5355 #else
5356         UNUSED(yes);
5357         UNUSED(sock);
5358 #endif
5359
5360         REQUIRE(VALID_SOCKET(sock));
5361
5362 #ifdef IPV6_V6ONLY
5363         if (sock->pf == AF_INET6) {
5364                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5365                                (void *)&onoff, sizeof(int)) < 0) {
5366                         char strbuf[ISC_STRERRORSIZE];
5367
5368                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5369                                          "setsockopt(%d, IPV6_V6ONLY) "
5370                                          "%s: %s", sock->fd,
5371                                          isc_msgcat_get(isc_msgcat,
5372                                                         ISC_MSGSET_GENERAL,
5373                                                         ISC_MSG_FAILED,
5374                                                         "failed"),
5375                                          strbuf);
5376                 }
5377         }
5378         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5379 #endif
5380 }
5381
5382 #ifndef ISC_PLATFORM_USETHREADS
5383 /* In our assumed scenario, we can simply use a single static object. */
5384 static isc_socketwait_t swait_private;
5385
5386 int
5387 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
5388         int n;
5389 #ifdef USE_KQUEUE
5390         struct timespec ts, *tsp;
5391 #endif
5392 #ifdef USE_EPOLL
5393         int timeout;
5394 #endif
5395 #ifdef USE_DEVPOLL
5396         struct dvpoll dvp;
5397 #endif
5398
5399         REQUIRE(swaitp != NULL && *swaitp == NULL);
5400
5401         if (socketmgr == NULL)
5402                 return (0);
5403
5404 #ifdef USE_KQUEUE
5405         if (tvp != NULL) {
5406                 ts.tv_sec = tvp->tv_sec;
5407                 ts.tv_nsec = tvp->tv_usec * 1000;
5408                 tsp = &ts;
5409         } else
5410                 tsp = NULL;
5411         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
5412                                        socketmgr->events, socketmgr->nevents,
5413                                        tsp);
5414         n = swait_private.nevents;
5415 #elif defined(USE_EPOLL)
5416         if (tvp != NULL)
5417                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5418         else
5419                 timeout = -1;
5420         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
5421                                            socketmgr->events,
5422                                            socketmgr->nevents, timeout);
5423         n = swait_private.nevents;
5424 #elif defined(USE_DEVPOLL)
5425         dvp.dp_fds = socketmgr->events;
5426         dvp.dp_nfds = socketmgr->nevents;
5427         if (tvp != NULL) {
5428                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5429                         (tvp->tv_usec + 999) / 1000;
5430         } else
5431                 dvp.dp_timeout = -1;
5432         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
5433         n = swait_private.nevents;
5434 #elif defined(USE_SELECT)
5435         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
5436                socketmgr->fd_bufsize);
5437         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
5438                socketmgr->fd_bufsize);
5439
5440         swait_private.readset = socketmgr->read_fds_copy;
5441         swait_private.writeset = socketmgr->write_fds_copy;
5442         swait_private.maxfd = socketmgr->maxfd + 1;
5443
5444         n = select(swait_private.maxfd, swait_private.readset,
5445                    swait_private.writeset, NULL, tvp);
5446 #endif
5447
5448         *swaitp = &swait_private;
5449         return (n);
5450 }
5451
5452 isc_result_t
5453 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
5454         REQUIRE(swait == &swait_private);
5455
5456         if (socketmgr == NULL)
5457                 return (ISC_R_NOTFOUND);
5458
5459 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5460         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
5461         return (ISC_R_SUCCESS);
5462 #elif defined(USE_SELECT)
5463         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5464         return (ISC_R_SUCCESS);
5465 #endif
5466 }
5467 #endif /* ISC_PLATFORM_USETHREADS */
5468
5469 void
5470 isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
5471
5472         /*
5473          * Name 'socket'.
5474          */
5475
5476         REQUIRE(VALID_SOCKET(socket));
5477
5478         LOCK(&socket->lock);
5479         memset(socket->name, 0, sizeof(socket->name));
5480         strncpy(socket->name, name, sizeof(socket->name) - 1);
5481         socket->tag = tag;
5482         UNLOCK(&socket->lock);
5483 }
5484
5485 const char *
5486 isc_socket_getname(isc_socket_t *socket) {
5487         return (socket->name);
5488 }
5489
5490 void *
5491 isc_socket_gettag(isc_socket_t *socket) {
5492         return (socket->tag);
5493 }
5494
5495 #ifdef HAVE_LIBXML2
5496
5497 static const char *
5498 _socktype(isc_sockettype_t type)
5499 {
5500         if (type == isc_sockettype_udp)
5501                 return ("udp");
5502         else if (type == isc_sockettype_tcp)
5503                 return ("tcp");
5504         else if (type == isc_sockettype_unix)
5505                 return ("unix");
5506         else if (type == isc_sockettype_fdwatch)
5507                 return ("fdwatch");
5508         else
5509                 return ("not-initialized");
5510 }
5511
5512 void
5513 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
5514 {
5515         isc_socket_t *sock;
5516         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5517         isc_sockaddr_t addr;
5518         ISC_SOCKADDR_LEN_T len;
5519
5520         LOCK(&mgr->lock);
5521
5522 #ifndef ISC_PLATFORM_USETHREADS
5523         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5524         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5525         xmlTextWriterEndElement(writer);
5526 #endif
5527
5528         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5529         sock = ISC_LIST_HEAD(mgr->socklist);
5530         while (sock != NULL) {
5531                 LOCK(&sock->lock);
5532                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5533
5534                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5535                 xmlTextWriterWriteFormatString(writer, "%p", sock);
5536                 xmlTextWriterEndElement(writer);
5537
5538                 if (sock->name[0] != 0) {
5539                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5540                         xmlTextWriterWriteFormatString(writer, "%s",
5541                                                        sock->name);
5542                         xmlTextWriterEndElement(writer); /* name */
5543                 }
5544
5545                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5546                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5547                 xmlTextWriterEndElement(writer);
5548
5549                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5550                                           ISC_XMLCHAR _socktype(sock->type));
5551
5552                 if (sock->connected) {
5553                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5554                                             sizeof(peerbuf));
5555                         xmlTextWriterWriteElement(writer,
5556                                                   ISC_XMLCHAR "peer-address",
5557                                                   ISC_XMLCHAR peerbuf);
5558                 }
5559
5560                 len = sizeof(addr);
5561                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5562                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5563                         xmlTextWriterWriteElement(writer,
5564                                                   ISC_XMLCHAR "local-address",
5565                                                   ISC_XMLCHAR peerbuf);
5566                 }
5567
5568                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5569                 if (sock->pending_recv)
5570                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5571                                                 ISC_XMLCHAR "pending-receive");
5572                 if (sock->pending_send)
5573                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5574                                                   ISC_XMLCHAR "pending-send");
5575                 if (sock->pending_accept)
5576                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5577                                                  ISC_XMLCHAR "pending_accept");
5578                 if (sock->listener)
5579                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5580                                                   ISC_XMLCHAR "listener");
5581                 if (sock->connected)
5582                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5583                                                   ISC_XMLCHAR "connected");
5584                 if (sock->connecting)
5585                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5586                                                   ISC_XMLCHAR "connecting");
5587                 if (sock->bound)
5588                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5589                                                   ISC_XMLCHAR "bound");
5590
5591                 xmlTextWriterEndElement(writer); /* states */
5592
5593                 xmlTextWriterEndElement(writer); /* socket */
5594
5595                 UNLOCK(&sock->lock);
5596                 sock = ISC_LIST_NEXT(sock, link);
5597         }
5598         xmlTextWriterEndElement(writer); /* sockets */
5599
5600         UNLOCK(&mgr->lock);
5601 }
5602 #endif /* HAVE_LIBXML2 */