]> CyberLeo.Net >> Repos - FreeBSD/releng/8.0.git/blob - contrib/bind9/lib/isc/unix/socket.c
Adjust to reflect 8.0-RELEASE.
[FreeBSD/releng/8.0.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.308.12.8 2009/04/18 01:29:26 jinmei Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #include <sys/devpoll.h>
71 #endif
72
73 #include "errno2result.h"
74
75 #ifndef ISC_PLATFORM_USETHREADS
76 #include "socket_p.h"
77 #endif /* ISC_PLATFORM_USETHREADS */
78
79 #if defined(SO_BSDCOMPAT) && defined(__linux__)
80 #include <sys/utsname.h>
81 #endif
82
/*%
 * Choose the most preferable multiplex method.  The chain below defines
 * one of USE_KQUEUE, USE_EPOLL, USE_DEVPOLL or USE_SELECT, tried in that
 * order, with select() as the fallback.
 */
#if defined(ISC_PLATFORM_HAVEKQUEUE)
#define USE_KQUEUE
#elif defined(ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined(ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*% Per-descriptor flags recording which directions are being polled. */
typedef struct {
        unsigned int want_read : 1;
        unsigned int want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif  /* multiplex method selection */
99
#if !defined(ISC_PLATFORM_USETHREADS)
/*
 * Non-threaded builds only: the shape of struct isc_socketwait depends on
 * the multiplex method selected above.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
        int             nevents;
};
#elif defined(USE_SELECT)
struct isc_socketwait {
        fd_set         *readset;
        fd_set         *writeset;
        int             nfds;
        int             maxfd;
};
#endif  /* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */
#endif  /* !ISC_PLATFORM_USETHREADS */
114
115 /*%
116  * Maximum number of allowable open sockets.  This is also the maximum
117  * allowable socket file descriptor.
118  *
119  * Care should be taken before modifying this value for select():
120  * The API standard doesn't ensure select() accepts more than (the system default
121  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
122  * the vast majority of cases.  This constant should therefore be increased only
123  * when absolutely necessary and possible, i.e., the server is exhausting all
124  * available file descriptors (up to FD_SETSIZE) and the select() function
125  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
126  * always be true, but we keep using some of them to ensure as much
127  * portability as possible).  Note also that overall server performance
128  * may be rather worsened with a larger value of this constant due to
129  * inherent scalability problems of select().
130  *
131  * As a special note, this value shouldn't have to be touched if
132  * this is a build for an authoritative only DNS server.
133  */
#if !defined(ISC_SOCKET_MAXSOCKETS)
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#define ISC_SOCKET_MAXSOCKETS 4096
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif  /* USE_KQUEUE... */
#endif  /* !ISC_SOCKET_MAXSOCKETS */

/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this (for select() builds) because a larger value can be
 * specified run-time.
 *
 * NOTE(review): this definition appears after the system headers included
 * at the top of the file; confirm it is still seen where it needs to be.
 */
#if defined(USE_SELECT) && defined(__APPLE__)
#define _DARWIN_UNLIMITED_SELECT
#endif  /* USE_SELECT && __APPLE__ */
151
152 #ifdef ISC_SOCKET_USE_POLLWATCH
153 /*%
154  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
155  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
156  * some of the specified FD.  The idea is based on the observation that it's
157  * likely for a busy server to keep receiving packets.  It specifically works
158  * as follows: the socket watcher is first initialized with the state of
159  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
160  * event occurs.  When it wakes up for a socket I/O event, it moves to the
161  * poll_active state, and sets the poll timeout to a short period
162  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
163  * watcher goes to the poll_checking state with the same timeout period.
164  * In this state, the watcher tries to detect whether this is a break
165  * during intermittent events or the kernel bug is triggered.  If the next
166  * polling reports an event within the short period, the previous timeout is
167  * likely to be a kernel bug, and so the watcher goes back to the active state.
168  * Otherwise, it moves to the idle state again.
169  *
170  * It's not clear whether this is a thread-related bug, but since we've only
171  * seen this with threads, this workaround is used only when enabling threads.
172  */
173
/*% Watcher states used by the /dev/poll workaround described above. */
typedef enum {
        poll_idle,      /* sleeping until a socket event occurs */
        poll_active,    /* woke up for socket I/O; polling with short timeout */
        poll_checking   /* timed out while active; polling once more */
} pollstate_t;

/*% Short poll timeout (msec) used in the active and checking states. */
#if !defined(ISC_SOCKET_POLLWATCH_TIMEOUT)
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
179 #endif  /* ISC_SOCKET_USE_POLLWATCH */
180
/*%
 * Size of per-FD lock buckets.  A descriptor maps to bucket FDLOCK_ID(fd);
 * without threads there is a single bucket.
 */
#if defined(ISC_PLATFORM_USETHREADS)
#define FDLOCK_COUNT            1024
#define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT            1
#define FDLOCK_ID(fd)           0
#endif  /* ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if (defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)) && \
    !defined(ISC_SOCKET_MAXEVENTS)
#define ISC_SOCKET_MAXEVENTS    64
#endif

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
209
/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here, so 0 counts as a soft error too.
 *
 * The argument is evaluated more than once.
 */
#define SOFT_ERROR(e) \
        ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
222
/*% Expands to the category, module and debug-level arguments of a log call. */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * Debug levels used by this file:
 *
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL             90
#define CORRECTNESS_LEVEL       70
#define IOEVENT_LEVEL           60
#define EVENT_LEVEL             50
#define CREATION_LEVEL          20

/* Ready-made DLVL() triples, one per level above. */
#define TRACE                   DLVL(TRACE_LEVEL)
#define CORRECTNESS             DLVL(CORRECTNESS_LEVEL)
#define IOEVENT                 DLVL(IOEVENT_LEVEL)
#define EVENT                   DLVL(EVENT_LEVEL)
#define CREATION                DLVL(CREATION_LEVEL)
243
244 typedef isc_event_t intev_t;
245
246 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
247 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
248
/*!
 * Control-message (cmsg) support is enabled for either of two reasons:
 *
 * - IPv6 control information.  If the socket is an IPv6 socket we want
 *   to collect the destination address and interface so the client can
 *   set them on outgoing packets.
 *
 * - NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 *   a setsockopt() like interface to request timestamps, and if the OS
 *   doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#if (defined(ISC_PLATFORM_HAVEIN6PKTINFO) || defined(SO_TIMESTAMP)) && \
    !defined(USE_CMSG)
#define USE_CMSG        1
#endif

/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#define RCVBUFSIZE      (32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES        10
280
281 struct isc_socket {
282         /* Not locked. */
283         unsigned int            magic;
284         isc_socketmgr_t        *manager;
285         isc_mutex_t             lock;
286         isc_sockettype_t        type;
287         const isc_statscounter_t        *statsindex;
288
289         /* Locked by socket lock. */
290         ISC_LINK(isc_socket_t)  link;
291         unsigned int            references;
292         int                     fd;
293         int                     pf;
294         char                            name[16];
295         void *                          tag;
296
297         ISC_LIST(isc_socketevent_t)             send_list;
298         ISC_LIST(isc_socketevent_t)             recv_list;
299         ISC_LIST(isc_socket_newconnev_t)        accept_list;
300         isc_socket_connev_t                    *connect_ev;
301
302         /*
303          * Internal events.  Posted when a descriptor is readable or
304          * writable.  These are statically allocated and never freed.
305          * They will be set to non-purgable before use.
306          */
307         intev_t                 readable_ev;
308         intev_t                 writable_ev;
309
310         isc_sockaddr_t          peer_address;  /* remote address */
311
312         unsigned int            pending_recv : 1,
313                                 pending_send : 1,
314                                 pending_accept : 1,
315                                 listener : 1, /* listener socket */
316                                 connected : 1,
317                                 connecting : 1, /* connect pending */
318                                 bound : 1; /* bound to local addr */
319
320 #ifdef ISC_NET_RECVOVERFLOW
321         unsigned char           overflow; /* used for MSG_TRUNC fake */
322 #endif
323
324         char                    *recvcmsgbuf;
325         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
326         char                    *sendcmsgbuf;
327         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
328
329         void                    *fdwatcharg;
330         isc_sockfdwatch_t       fdwatchcb;
331         int                     fdwatchflags;
332         isc_task_t              *fdwatchtask;
333 };
334
335 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
336 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
337
338 struct isc_socketmgr {
339         /* Not locked. */
340         unsigned int            magic;
341         isc_mem_t              *mctx;
342         isc_mutex_t             lock;
343         isc_mutex_t             *fdlock;
344         isc_stats_t             *stats;
345 #ifdef USE_KQUEUE
346         int                     kqueue_fd;
347         int                     nevents;
348         struct kevent           *events;
349 #endif  /* USE_KQUEUE */
350 #ifdef USE_EPOLL
351         int                     epoll_fd;
352         int                     nevents;
353         struct epoll_event      *events;
354 #endif  /* USE_EPOLL */
355 #ifdef USE_DEVPOLL
356         int                     devpoll_fd;
357         int                     nevents;
358         struct pollfd           *events;
359 #endif  /* USE_DEVPOLL */
360 #ifdef USE_SELECT
361         int                     fd_bufsize;
362 #endif  /* USE_SELECT */
363         unsigned int            maxsocks;
364 #ifdef ISC_PLATFORM_USETHREADS
365         int                     pipe_fds[2];
366 #endif
367
368         /* Locked by fdlock. */
369         isc_socket_t           **fds;
370         int                     *fdstate;
371 #ifdef USE_DEVPOLL
372         pollinfo_t              *fdpollinfo;
373 #endif
374
375         /* Locked by manager lock. */
376         ISC_LIST(isc_socket_t)  socklist;
377 #ifdef USE_SELECT
378         fd_set                  *read_fds;
379         fd_set                  *read_fds_copy;
380         fd_set                  *write_fds;
381         fd_set                  *write_fds_copy;
382         int                     maxfd;
383 #endif  /* USE_SELECT */
384         int                     reserved;       /* unlocked */
385 #ifdef ISC_PLATFORM_USETHREADS
386         isc_thread_t            watcher;
387         isc_condition_t         shutdown_ok;
388 #else /* ISC_PLATFORM_USETHREADS */
389         unsigned int            refs;
390 #endif /* ISC_PLATFORM_USETHREADS */
391 };
392
393 #ifndef ISC_PLATFORM_USETHREADS
394 static isc_socketmgr_t *socketmgr = NULL;
395 #endif /* ISC_PLATFORM_USETHREADS */
396
/* Values stored in the manager's fdstate[] array. */
#define CLOSED                  0       /* this one must be zero */
#define MANAGED                 1
#define CLOSE_PENDING           2

/*
 * send() and recv() iovec counts.  The receive side gets one extra slot
 * when ISC_NET_RECVOVERFLOW is defined.
 */
#define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
#if defined(ISC_NET_RECVOVERFLOW)
# define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
#endif
410
411 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
412 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
413 static void free_socket(isc_socket_t **);
414 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
415                                     isc_socket_t **);
416 static void destroy(isc_socket_t **);
417 static void internal_accept(isc_task_t *, isc_event_t *);
418 static void internal_connect(isc_task_t *, isc_event_t *);
419 static void internal_recv(isc_task_t *, isc_event_t *);
420 static void internal_send(isc_task_t *, isc_event_t *);
421 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
422 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
423 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
424 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
425                               struct msghdr *, struct iovec *, size_t *);
426 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
427                               struct msghdr *, struct iovec *, size_t *);
428 #ifdef ISC_PLATFORM_USETHREADS
429 static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
430 #endif
431
/*
 * Message codes passed along with a descriptor when poking the watcher.
 * ACCEPT shares its value with READ, and CONNECT with WRITE.
 */
#define SELECT_POKE_SHUTDOWN            (-1)
#define SELECT_POKE_NOTHING             (-2)
#define SELECT_POKE_READ                (-3)
#define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE               (-4)
#define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE               (-5)

/*% True once a socket's reference count has dropped to zero. */
#define SOCK_DEAD(s)                    ((s)->references == 0)
441
/*%
 * Shortcut index arrays to get access to statistics counters.
 * The STATID_* values name the slots of the *statsindex tables below.
 */
enum {
        STATID_OPEN = 0,
        STATID_OPENFAIL,        /* 1 */
        STATID_CLOSE,           /* 2 */
        STATID_BINDFAIL,        /* 3 */
        STATID_CONNECTFAIL,     /* 4 */
        STATID_CONNECT,         /* 5 */
        STATID_ACCEPTFAIL,      /* 6 */
        STATID_ACCEPT,          /* 7 */
        STATID_SENDFAIL,        /* 8 */
        STATID_RECVFAIL         /* 9 */
};
457 static const isc_statscounter_t upd4statsindex[] = {
458         isc_sockstatscounter_udp4open,
459         isc_sockstatscounter_udp4openfail,
460         isc_sockstatscounter_udp4close,
461         isc_sockstatscounter_udp4bindfail,
462         isc_sockstatscounter_udp4connectfail,
463         isc_sockstatscounter_udp4connect,
464         -1,
465         -1,
466         isc_sockstatscounter_udp4sendfail,
467         isc_sockstatscounter_udp4recvfail
468 };
469 static const isc_statscounter_t upd6statsindex[] = {
470         isc_sockstatscounter_udp6open,
471         isc_sockstatscounter_udp6openfail,
472         isc_sockstatscounter_udp6close,
473         isc_sockstatscounter_udp6bindfail,
474         isc_sockstatscounter_udp6connectfail,
475         isc_sockstatscounter_udp6connect,
476         -1,
477         -1,
478         isc_sockstatscounter_udp6sendfail,
479         isc_sockstatscounter_udp6recvfail
480 };
481 static const isc_statscounter_t tcp4statsindex[] = {
482         isc_sockstatscounter_tcp4open,
483         isc_sockstatscounter_tcp4openfail,
484         isc_sockstatscounter_tcp4close,
485         isc_sockstatscounter_tcp4bindfail,
486         isc_sockstatscounter_tcp4connectfail,
487         isc_sockstatscounter_tcp4connect,
488         isc_sockstatscounter_tcp4acceptfail,
489         isc_sockstatscounter_tcp4accept,
490         isc_sockstatscounter_tcp4sendfail,
491         isc_sockstatscounter_tcp4recvfail
492 };
493 static const isc_statscounter_t tcp6statsindex[] = {
494         isc_sockstatscounter_tcp6open,
495         isc_sockstatscounter_tcp6openfail,
496         isc_sockstatscounter_tcp6close,
497         isc_sockstatscounter_tcp6bindfail,
498         isc_sockstatscounter_tcp6connectfail,
499         isc_sockstatscounter_tcp6connect,
500         isc_sockstatscounter_tcp6acceptfail,
501         isc_sockstatscounter_tcp6accept,
502         isc_sockstatscounter_tcp6sendfail,
503         isc_sockstatscounter_tcp6recvfail
504 };
505 static const isc_statscounter_t unixstatsindex[] = {
506         isc_sockstatscounter_unixopen,
507         isc_sockstatscounter_unixopenfail,
508         isc_sockstatscounter_unixclose,
509         isc_sockstatscounter_unixbindfail,
510         isc_sockstatscounter_unixconnectfail,
511         isc_sockstatscounter_unixconnect,
512         isc_sockstatscounter_unixacceptfail,
513         isc_sockstatscounter_unixaccept,
514         isc_sockstatscounter_unixsendfail,
515         isc_sockstatscounter_unixrecvfail
516 };
517 static const isc_statscounter_t fdwatchstatsindex[] = {
518         -1,
519         -1,
520         isc_sockstatscounter_fdwatchclose,
521         isc_sockstatscounter_fdwatchbindfail,
522         isc_sockstatscounter_fdwatchconnectfail,
523         isc_sockstatscounter_fdwatchconnect,
524         -1,
525         -1,
526         isc_sockstatscounter_fdwatchsendfail,
527         isc_sockstatscounter_fdwatchrecvfail
528 };
529
530 static void
531 manager_log(isc_socketmgr_t *sockmgr,
532             isc_logcategory_t *category, isc_logmodule_t *module, int level,
533             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
534 static void
535 manager_log(isc_socketmgr_t *sockmgr,
536             isc_logcategory_t *category, isc_logmodule_t *module, int level,
537             const char *fmt, ...)
538 {
539         char msgbuf[2048];
540         va_list ap;
541
542         if (! isc_log_wouldlog(isc_lctx, level))
543                 return;
544
545         va_start(ap, fmt);
546         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
547         va_end(ap);
548
549         isc_log_write(isc_lctx, category, module, level,
550                       "sockmgr %p: %s", sockmgr, msgbuf);
551 }
552
553 static void
554 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
555            isc_logcategory_t *category, isc_logmodule_t *module, int level,
556            isc_msgcat_t *msgcat, int msgset, int message,
557            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
558 static void
559 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
560            isc_logcategory_t *category, isc_logmodule_t *module, int level,
561            isc_msgcat_t *msgcat, int msgset, int message,
562            const char *fmt, ...)
563 {
564         char msgbuf[2048];
565         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
566         va_list ap;
567
568         if (! isc_log_wouldlog(isc_lctx, level))
569                 return;
570
571         va_start(ap, fmt);
572         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
573         va_end(ap);
574
575         if (address == NULL) {
576                 isc_log_iwrite(isc_lctx, category, module, level,
577                                msgcat, msgset, message,
578                                "socket %p: %s", sock, msgbuf);
579         } else {
580                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
581                 isc_log_iwrite(isc_lctx, category, module, level,
582                                msgcat, msgset, message,
583                                "socket %p %s: %s", sock, peerbuf, msgbuf);
584         }
585 }
586
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Turn IPV6_RECVPKTINFO back on for 'sock'.  Only IPv6 UDP sockets are
 * touched; any other socket is left alone.  A setsockopt() failure is
 * reported through UNEXPECTED_ERROR and otherwise ignored.
 *
 * On every other configuration this is a no-op macro.
 */
static void
FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
{
        char strbuf[ISC_STRERRORSIZE];
        int on = 1;

        if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
                return;

        if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                       (void *)&on, sizeof(on)) < 0) {
                /*
                 * Fill strbuf from errno before it is printed; it was
                 * previously passed to the "%s" below uninitialized.
                 */
                isc__strerror(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "setsockopt(%d, IPV6_RECVPKTINFO) "
                                 "%s: %s", sock->fd,
                                 isc_msgcat_get(isc_msgcat,
                                                ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED,
                                                "failed"),
                                 strbuf);
        }
}
#else
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
618
619 /*%
620  * Increment socket-related statistics counters.
621  */
622 static inline void
623 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
624         REQUIRE(counterid != -1);
625
626         if (stats != NULL)
627                 isc_stats_increment(stats, counterid);
628 }
629
630 static inline isc_result_t
631 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
632         isc_result_t result = ISC_R_SUCCESS;
633
634 #ifdef USE_KQUEUE
635         struct kevent evchange;
636
637         memset(&evchange, 0, sizeof(evchange));
638         if (msg == SELECT_POKE_READ)
639                 evchange.filter = EVFILT_READ;
640         else
641                 evchange.filter = EVFILT_WRITE;
642         evchange.flags = EV_ADD;
643         evchange.ident = fd;
644         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
645                 result = isc__errno2result(errno);
646
647         return (result);
648 #elif defined(USE_EPOLL)
649         struct epoll_event event;
650
651         if (msg == SELECT_POKE_READ)
652                 event.events = EPOLLIN;
653         else
654                 event.events = EPOLLOUT;
655         event.data.fd = fd;
656         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
657             errno != EEXIST) {
658                 result = isc__errno2result(errno);
659         }
660
661         return (result);
662 #elif defined(USE_DEVPOLL)
663         struct pollfd pfd;
664         int lockid = FDLOCK_ID(fd);
665
666         memset(&pfd, 0, sizeof(pfd));
667         if (msg == SELECT_POKE_READ)
668                 pfd.events = POLLIN;
669         else
670                 pfd.events = POLLOUT;
671         pfd.fd = fd;
672         pfd.revents = 0;
673         LOCK(&manager->fdlock[lockid]);
674         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
675                 result = isc__errno2result(errno);
676         else {
677                 if (msg == SELECT_POKE_READ)
678                         manager->fdpollinfo[fd].want_read = 1;
679                 else
680                         manager->fdpollinfo[fd].want_write = 1;
681         }
682         UNLOCK(&manager->fdlock[lockid]);
683
684         return (result);
685 #elif defined(USE_SELECT)
686         LOCK(&manager->lock);
687         if (msg == SELECT_POKE_READ)
688                 FD_SET(fd, manager->read_fds);
689         if (msg == SELECT_POKE_WRITE)
690                 FD_SET(fd, manager->write_fds);
691         UNLOCK(&manager->lock);
692
693         return (result);
694 #endif
695 }
696
697 static inline isc_result_t
698 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
699         isc_result_t result = ISC_R_SUCCESS;
700
701 #ifdef USE_KQUEUE
702         struct kevent evchange;
703
704         memset(&evchange, 0, sizeof(evchange));
705         if (msg == SELECT_POKE_READ)
706                 evchange.filter = EVFILT_READ;
707         else
708                 evchange.filter = EVFILT_WRITE;
709         evchange.flags = EV_DELETE;
710         evchange.ident = fd;
711         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
712                 result = isc__errno2result(errno);
713
714         return (result);
715 #elif defined(USE_EPOLL)
716         struct epoll_event event;
717
718         if (msg == SELECT_POKE_READ)
719                 event.events = EPOLLIN;
720         else
721                 event.events = EPOLLOUT;
722         event.data.fd = fd;
723         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
724             errno != ENOENT) {
725                 char strbuf[ISC_STRERRORSIZE];
726                 isc__strerror(errno, strbuf, sizeof(strbuf));
727                 UNEXPECTED_ERROR(__FILE__, __LINE__,
728                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
729                 result = ISC_R_UNEXPECTED;
730         }
731         return (result);
732 #elif defined(USE_DEVPOLL)
733         struct pollfd pfds[2];
734         size_t writelen = sizeof(pfds[0]);
735         int lockid = FDLOCK_ID(fd);
736
737         memset(pfds, 0, sizeof(pfds));
738         pfds[0].events = POLLREMOVE;
739         pfds[0].fd = fd;
740
741         /*
742          * Canceling read or write polling via /dev/poll is tricky.  Since it
743          * only provides a way of canceling per FD, we may need to re-poll the
744          * socket for the other operation.
745          */
746         LOCK(&manager->fdlock[lockid]);
747         if (msg == SELECT_POKE_READ &&
748             manager->fdpollinfo[fd].want_write == 1) {
749                 pfds[1].events = POLLOUT;
750                 pfds[1].fd = fd;
751                 writelen += sizeof(pfds[1]);
752         }
753         if (msg == SELECT_POKE_WRITE &&
754             manager->fdpollinfo[fd].want_read == 1) {
755                 pfds[1].events = POLLIN;
756                 pfds[1].fd = fd;
757                 writelen += sizeof(pfds[1]);
758         }
759
760         if (write(manager->devpoll_fd, pfds, writelen) == -1)
761                 result = isc__errno2result(errno);
762         else {
763                 if (msg == SELECT_POKE_READ)
764                         manager->fdpollinfo[fd].want_read = 0;
765                 else
766                         manager->fdpollinfo[fd].want_write = 0;
767         }
768         UNLOCK(&manager->fdlock[lockid]);
769
770         return (result);
771 #elif defined(USE_SELECT)
772         LOCK(&manager->lock);
773         if (msg == SELECT_POKE_READ)
774                 FD_CLR(fd, manager->read_fds);
775         else if (msg == SELECT_POKE_WRITE)
776                 FD_CLR(fd, manager->write_fds);
777         UNLOCK(&manager->lock);
778
779         return (result);
780 #endif
781 }
782
783 static void
784 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
785         isc_result_t result;
786         int lockid = FDLOCK_ID(fd);
787
788         /*
789          * This is a wakeup on a socket.  If the socket is not in the
790          * process of being closed, start watching it for either reads
791          * or writes.
792          */
793
794         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
795
796         if (msg == SELECT_POKE_CLOSE) {
797                 /* No one should be updating fdstate, so no need to lock it */
798                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
799                 manager->fdstate[fd] = CLOSED;
800                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
801                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
802                 (void)close(fd);
803                 return;
804         }
805
806         LOCK(&manager->fdlock[lockid]);
807         if (manager->fdstate[fd] == CLOSE_PENDING) {
808                 UNLOCK(&manager->fdlock[lockid]);
809
810                 /*
811                  * We accept (and ignore) any error from unwatch_fd() as we are
812                  * closing the socket, hoping it doesn't leave dangling state in
813                  * the kernel.
814                  * Note that unwatch_fd() must be called after releasing the
815                  * fdlock; otherwise it could cause deadlock due to a lock order
816                  * reversal.
817                  */
818                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
819                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
820                 return;
821         }
822         if (manager->fdstate[fd] != MANAGED) {
823                 UNLOCK(&manager->fdlock[lockid]);
824                 return;
825         }
826         UNLOCK(&manager->fdlock[lockid]);
827
828         /*
829          * Set requested bit.
830          */
831         result = watch_fd(manager, fd, msg);
832         if (result != ISC_R_SUCCESS) {
833                 /*
834                  * XXXJT: what should we do?  Ignoring the failure of watching
835                  * a socket will make the application dysfunctional, but there
836                  * seems to be no reasonable recovery process.
837                  */
838                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
839                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
840                               "failed to start watching FD (%d): %s",
841                               fd, isc_result_totext(result));
842         }
843 }
844
845 #ifdef ISC_PLATFORM_USETHREADS
846 /*
847  * Poke the select loop when there is something for us to do.
848  * The write is required (by POSIX) to complete.  That is, we
849  * will not get partial writes.
850  */
851 static void
852 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
853         int cc;
854         int buf[2];
855         char strbuf[ISC_STRERRORSIZE];
856
857         buf[0] = fd;
858         buf[1] = msg;
859
860         do {
861                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
862 #ifdef ENOSR
863                 /*
864                  * Treat ENOSR as EAGAIN but loop slowly as it is
865                  * unlikely to clear fast.
866                  */
867                 if (cc < 0 && errno == ENOSR) {
868                         sleep(1);
869                         errno = EAGAIN;
870                 }
871 #endif
872         } while (cc < 0 && SOFT_ERROR(errno));
873
874         if (cc < 0) {
875                 isc__strerror(errno, strbuf, sizeof(strbuf));
876                 FATAL_ERROR(__FILE__, __LINE__,
877                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
878                                            ISC_MSG_WRITEFAILED,
879                                            "write() failed "
880                                            "during watcher poke: %s"),
881                             strbuf);
882         }
883
884         INSIST(cc == sizeof(buf));
885 }
886
887 /*
888  * Read a message on the internal fd.
889  */
890 static void
891 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
892         int buf[2];
893         int cc;
894         char strbuf[ISC_STRERRORSIZE];
895
896         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
897         if (cc < 0) {
898                 *msg = SELECT_POKE_NOTHING;
899                 *fd = -1;       /* Silence compiler. */
900                 if (SOFT_ERROR(errno))
901                         return;
902
903                 isc__strerror(errno, strbuf, sizeof(strbuf));
904                 FATAL_ERROR(__FILE__, __LINE__,
905                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
906                                            ISC_MSG_READFAILED,
907                                            "read() failed "
908                                            "during watcher poke: %s"),
909                             strbuf);
910
911                 return;
912         }
913         INSIST(cc == sizeof(buf));
914
915         *fd = buf[0];
916         *msg = buf[1];
917 }
918 #else /* ISC_PLATFORM_USETHREADS */
919 /*
920  * Update the state of the socketmgr when something changes.
921  */
922 static void
923 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
924         if (msg == SELECT_POKE_SHUTDOWN)
925                 return;
926         else if (fd >= 0)
927                 wakeup_socket(manager, fd, msg);
928         return;
929 }
930 #endif /* ISC_PLATFORM_USETHREADS */
931
932 /*
933  * Make a fd non-blocking.
934  */
935 static isc_result_t
936 make_nonblock(int fd) {
937         int ret;
938         int flags;
939         char strbuf[ISC_STRERRORSIZE];
940 #ifdef USE_FIONBIO_IOCTL
941         int on = 1;
942
943         ret = ioctl(fd, FIONBIO, (char *)&on);
944 #else
945         flags = fcntl(fd, F_GETFL, 0);
946         flags |= PORT_NONBLOCK;
947         ret = fcntl(fd, F_SETFL, flags);
948 #endif
949
950         if (ret == -1) {
951                 isc__strerror(errno, strbuf, sizeof(strbuf));
952                 UNEXPECTED_ERROR(__FILE__, __LINE__,
953 #ifdef USE_FIONBIO_IOCTL
954                                  "ioctl(%d, FIONBIO, &on): %s", fd,
955 #else
956                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
957 #endif
958                                  strbuf);
959
960                 return (ISC_R_UNEXPECTED);
961         }
962
963         return (ISC_R_SUCCESS);
964 }
965
966 #ifdef USE_CMSG
967 /*
968  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
969  * In order to ensure as much portability as possible, we provide wrapper
970  * functions of these macros.
971  * Note that cmsg_space() could run slow on OSes that do not have
972  * CMSG_SPACE.
973  */
974 static inline ISC_SOCKADDR_LEN_T
975 cmsg_len(ISC_SOCKADDR_LEN_T len) {
976 #ifdef CMSG_LEN
977         return (CMSG_LEN(len));
978 #else
979         ISC_SOCKADDR_LEN_T hdrlen;
980
981         /*
982          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
983          * is correct.
984          */
985         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
986         return (hdrlen + len);
987 #endif
988 }
989
990 static inline ISC_SOCKADDR_LEN_T
991 cmsg_space(ISC_SOCKADDR_LEN_T len) {
992 #ifdef CMSG_SPACE
993         return (CMSG_SPACE(len));
994 #else
995         struct msghdr msg;
996         struct cmsghdr *cmsgp;
997         /*
998          * XXX: The buffer length is an ad-hoc value, but should be enough
999          * in a practical sense.
1000          */
1001         char dummybuf[sizeof(struct cmsghdr) + 1024];
1002
1003         memset(&msg, 0, sizeof(msg));
1004         msg.msg_control = dummybuf;
1005         msg.msg_controllen = sizeof(dummybuf);
1006
1007         cmsgp = (struct cmsghdr *)dummybuf;
1008         cmsgp->cmsg_len = cmsg_len(len);
1009
1010         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1011         if (cmsgp != NULL)
1012                 return ((char *)cmsgp - (char *)msg.msg_control);
1013         else
1014                 return (0);
1015 #endif
1016 }
1017 #endif /* USE_CMSG */
1018
1019 /*
1020  * Process control messages received on a socket.
1021  */
1022 static void
1023 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1024 #ifdef USE_CMSG
1025         struct cmsghdr *cmsgp;
1026 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1027         struct in6_pktinfo *pktinfop;
1028 #endif
1029 #ifdef SO_TIMESTAMP
1030         struct timeval *timevalp;
1031 #endif
1032 #endif
1033
1034         /*
1035          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1036          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1037          * They are all here, outside of the CPP tests, because it is
1038          * more consistent with the usual ISC coding style.
1039          */
1040         UNUSED(sock);
1041         UNUSED(msg);
1042         UNUSED(dev);
1043
1044 #ifdef ISC_NET_BSD44MSGHDR
1045
1046 #ifdef MSG_TRUNC
1047         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1048                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1049 #endif
1050
1051 #ifdef MSG_CTRUNC
1052         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1053                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1054 #endif
1055
1056 #ifndef USE_CMSG
1057         return;
1058 #else
1059         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1060                 return;
1061
1062 #ifdef SO_TIMESTAMP
1063         timevalp = NULL;
1064 #endif
1065 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1066         pktinfop = NULL;
1067 #endif
1068
1069         cmsgp = CMSG_FIRSTHDR(msg);
1070         while (cmsgp != NULL) {
1071                 socket_log(sock, NULL, TRACE,
1072                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1073                            "processing cmsg %p", cmsgp);
1074
1075 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1076                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1077                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1078
1079                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1080                         memcpy(&dev->pktinfo, pktinfop,
1081                                sizeof(struct in6_pktinfo));
1082                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1083                         socket_log(sock, NULL, TRACE,
1084                                    isc_msgcat, ISC_MSGSET_SOCKET,
1085                                    ISC_MSG_IFRECEIVED,
1086                                    "interface received on ifindex %u",
1087                                    dev->pktinfo.ipi6_ifindex);
1088                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1089                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1090                         goto next;
1091                 }
1092 #endif
1093
1094 #ifdef SO_TIMESTAMP
1095                 if (cmsgp->cmsg_level == SOL_SOCKET
1096                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1097                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1098                         dev->timestamp.seconds = timevalp->tv_sec;
1099                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1100                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1101                         goto next;
1102                 }
1103 #endif
1104
1105         next:
1106                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1107         }
1108 #endif /* USE_CMSG */
1109
1110 #endif /* ISC_NET_BSD44MSGHDR */
1111 }
1112
1113 /*
1114  * Construct an iov array and attach it to the msghdr passed in.  This is
1115  * the SEND constructor, which will use the used region of the buffer
1116  * (if using a buffer list) or will use the internal region (if a single
1117  * buffer I/O is requested).
1118  *
1119  * Nothing can be NULL, and the done event must list at least one buffer
1120  * on the buffer linked list for this function to be meaningful.
1121  *
1122  * If write_countp != NULL, *write_countp will hold the number of bytes
1123  * this transaction can send.
1124  */
1125 static void
1126 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
1127                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1128 {
1129         unsigned int iovcount;
1130         isc_buffer_t *buffer;
1131         isc_region_t used;
1132         size_t write_count;
1133         size_t skip_count;
1134
1135         memset(msg, 0, sizeof(*msg));
1136
1137         if (!sock->connected) {
1138                 msg->msg_name = (void *)&dev->address.type.sa;
1139                 msg->msg_namelen = dev->address.length;
1140         } else {
1141                 msg->msg_name = NULL;
1142                 msg->msg_namelen = 0;
1143         }
1144
1145         buffer = ISC_LIST_HEAD(dev->bufferlist);
1146         write_count = 0;
1147         iovcount = 0;
1148
1149         /*
1150          * Single buffer I/O?  Skip what we've done so far in this region.
1151          */
1152         if (buffer == NULL) {
1153                 write_count = dev->region.length - dev->n;
1154                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1155                 iov[0].iov_len = write_count;
1156                 iovcount = 1;
1157
1158                 goto config;
1159         }
1160
1161         /*
1162          * Multibuffer I/O.
1163          * Skip the data in the buffer list that we have already written.
1164          */
1165         skip_count = dev->n;
1166         while (buffer != NULL) {
1167                 REQUIRE(ISC_BUFFER_VALID(buffer));
1168                 if (skip_count < isc_buffer_usedlength(buffer))
1169                         break;
1170                 skip_count -= isc_buffer_usedlength(buffer);
1171                 buffer = ISC_LIST_NEXT(buffer, link);
1172         }
1173
1174         while (buffer != NULL) {
1175                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1176
1177                 isc_buffer_usedregion(buffer, &used);
1178
1179                 if (used.length > 0) {
1180                         iov[iovcount].iov_base = (void *)(used.base
1181                                                           + skip_count);
1182                         iov[iovcount].iov_len = used.length - skip_count;
1183                         write_count += (used.length - skip_count);
1184                         skip_count = 0;
1185                         iovcount++;
1186                 }
1187                 buffer = ISC_LIST_NEXT(buffer, link);
1188         }
1189
1190         INSIST(skip_count == 0U);
1191
1192  config:
1193         msg->msg_iov = iov;
1194         msg->msg_iovlen = iovcount;
1195
1196 #ifdef ISC_NET_BSD44MSGHDR
1197         msg->msg_control = NULL;
1198         msg->msg_controllen = 0;
1199         msg->msg_flags = 0;
1200 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1201         if ((sock->type == isc_sockettype_udp)
1202             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1203                 struct cmsghdr *cmsgp;
1204                 struct in6_pktinfo *pktinfop;
1205
1206                 socket_log(sock, NULL, TRACE,
1207                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1208                            "sendto pktinfo data, ifindex %u",
1209                            dev->pktinfo.ipi6_ifindex);
1210
1211                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1212                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1213                 msg->msg_control = (void *)sock->sendcmsgbuf;
1214
1215                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1216                 cmsgp->cmsg_level = IPPROTO_IPV6;
1217                 cmsgp->cmsg_type = IPV6_PKTINFO;
1218                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1219                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1220                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1221         }
1222 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1223 #else /* ISC_NET_BSD44MSGHDR */
1224         msg->msg_accrights = NULL;
1225         msg->msg_accrightslen = 0;
1226 #endif /* ISC_NET_BSD44MSGHDR */
1227
1228         if (write_countp != NULL)
1229                 *write_countp = write_count;
1230 }
1231
1232 /*
1233  * Construct an iov array and attach it to the msghdr passed in.  This is
1234  * the RECV constructor, which will use the available region of the buffer
1235  * (if using a buffer list) or will use the internal region (if a single
1236  * buffer I/O is requested).
1237  *
1238  * Nothing can be NULL, and the done event must list at least one buffer
1239  * on the buffer linked list for this function to be meaningful.
1240  *
1241  * If read_countp != NULL, *read_countp will hold the number of bytes
1242  * this transaction can receive.
1243  */
1244 static void
1245 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1246                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1247 {
1248         unsigned int iovcount;
1249         isc_buffer_t *buffer;
1250         isc_region_t available;
1251         size_t read_count;
1252
1253         memset(msg, 0, sizeof(struct msghdr));
1254
1255         if (sock->type == isc_sockettype_udp) {
1256                 memset(&dev->address, 0, sizeof(dev->address));
1257 #ifdef BROKEN_RECVMSG
1258                 if (sock->pf == AF_INET) {
1259                         msg->msg_name = (void *)&dev->address.type.sin;
1260                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1261                 } else if (sock->pf == AF_INET6) {
1262                         msg->msg_name = (void *)&dev->address.type.sin6;
1263                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1264 #ifdef ISC_PLATFORM_HAVESYSUNH
1265                 } else if (sock->pf == AF_UNIX) {
1266                         msg->msg_name = (void *)&dev->address.type.sunix;
1267                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1268 #endif
1269                 } else {
1270                         msg->msg_name = (void *)&dev->address.type.sa;
1271                         msg->msg_namelen = sizeof(dev->address.type);
1272                 }
1273 #else
1274                 msg->msg_name = (void *)&dev->address.type.sa;
1275                 msg->msg_namelen = sizeof(dev->address.type);
1276 #endif
1277 #ifdef ISC_NET_RECVOVERFLOW
1278                 /* If needed, steal one iovec for overflow detection. */
1279                 maxiov--;
1280 #endif
1281         } else { /* TCP */
1282                 msg->msg_name = NULL;
1283                 msg->msg_namelen = 0;
1284                 dev->address = sock->peer_address;
1285         }
1286
1287         buffer = ISC_LIST_HEAD(dev->bufferlist);
1288         read_count = 0;
1289
1290         /*
1291          * Single buffer I/O?  Skip what we've done so far in this region.
1292          */
1293         if (buffer == NULL) {
1294                 read_count = dev->region.length - dev->n;
1295                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1296                 iov[0].iov_len = read_count;
1297                 iovcount = 1;
1298
1299                 goto config;
1300         }
1301
1302         /*
1303          * Multibuffer I/O.
1304          * Skip empty buffers.
1305          */
1306         while (buffer != NULL) {
1307                 REQUIRE(ISC_BUFFER_VALID(buffer));
1308                 if (isc_buffer_availablelength(buffer) != 0)
1309                         break;
1310                 buffer = ISC_LIST_NEXT(buffer, link);
1311         }
1312
1313         iovcount = 0;
1314         while (buffer != NULL) {
1315                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1316
1317                 isc_buffer_availableregion(buffer, &available);
1318
1319                 if (available.length > 0) {
1320                         iov[iovcount].iov_base = (void *)(available.base);
1321                         iov[iovcount].iov_len = available.length;
1322                         read_count += available.length;
1323                         iovcount++;
1324                 }
1325                 buffer = ISC_LIST_NEXT(buffer, link);
1326         }
1327
1328  config:
1329
1330         /*
1331          * If needed, set up to receive that one extra byte.  Note that
1332          * we know there is at least one iov left, since we stole it
1333          * at the top of this function.
1334          */
1335 #ifdef ISC_NET_RECVOVERFLOW
1336         if (sock->type == isc_sockettype_udp) {
1337                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1338                 iov[iovcount].iov_len = 1;
1339                 iovcount++;
1340         }
1341 #endif
1342
1343         msg->msg_iov = iov;
1344         msg->msg_iovlen = iovcount;
1345
1346 #ifdef ISC_NET_BSD44MSGHDR
1347         msg->msg_control = NULL;
1348         msg->msg_controllen = 0;
1349         msg->msg_flags = 0;
1350 #if defined(USE_CMSG)
1351         if (sock->type == isc_sockettype_udp) {
1352                 msg->msg_control = sock->recvcmsgbuf;
1353                 msg->msg_controllen = sock->recvcmsgbuflen;
1354         }
1355 #endif /* USE_CMSG */
1356 #else /* ISC_NET_BSD44MSGHDR */
1357         msg->msg_accrights = NULL;
1358         msg->msg_accrightslen = 0;
1359 #endif /* ISC_NET_BSD44MSGHDR */
1360
1361         if (read_countp != NULL)
1362                 *read_countp = read_count;
1363 }
1364
1365 static void
1366 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1367                 isc_socketevent_t *dev)
1368 {
1369         if (sock->type == isc_sockettype_udp) {
1370                 if (address != NULL)
1371                         dev->address = *address;
1372                 else
1373                         dev->address = sock->peer_address;
1374         } else if (sock->type == isc_sockettype_tcp) {
1375                 INSIST(address == NULL);
1376                 dev->address = sock->peer_address;
1377         }
1378 }
1379
1380 static void
1381 destroy_socketevent(isc_event_t *event) {
1382         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1383
1384         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1385
1386         (ev->destroy)(event);
1387 }
1388
1389 static isc_socketevent_t *
1390 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1391                      isc_taskaction_t action, const void *arg)
1392 {
1393         isc_socketevent_t *ev;
1394
1395         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1396                                                      sock, eventtype,
1397                                                      action, arg,
1398                                                      sizeof(*ev));
1399
1400         if (ev == NULL)
1401                 return (NULL);
1402
1403         ev->result = ISC_R_UNEXPECTED;
1404         ISC_LINK_INIT(ev, ev_link);
1405         ISC_LIST_INIT(ev->bufferlist);
1406         ev->region.base = NULL;
1407         ev->n = 0;
1408         ev->offset = 0;
1409         ev->attributes = 0;
1410         ev->destroy = ev->ev_destroy;
1411         ev->ev_destroy = destroy_socketevent;
1412
1413         return (ev);
1414 }
1415
1416 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr to stdout -- its
 * address, the name pointer and length, each iovec's base and length,
 * and (with ISC_NET_BSD44MSGHDR) the control buffer pointer and length.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	/* %p expects a void pointer, so object pointers are cast. */
	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		/* i is unsigned, hence %u. */
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
1435 #endif
1436
1437 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
1438 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
1439 #define DOIO_HARD               2       /* i/o error, event sent */
1440 #define DOIO_EOF                3       /* EOF, no event sent */
1441
1442 static int
1443 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1444         int cc;
1445         struct iovec iov[MAXSCATTERGATHER_RECV];
1446         size_t read_count;
1447         size_t actual_count;
1448         struct msghdr msghdr;
1449         isc_buffer_t *buffer;
1450         int recv_errno;
1451         char strbuf[ISC_STRERRORSIZE];
1452
1453         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1454
1455 #if defined(ISC_SOCKET_DEBUG)
1456         dump_msg(&msghdr);
1457 #endif
1458
1459         cc = recvmsg(sock->fd, &msghdr, 0);
1460         recv_errno = errno;
1461
1462 #if defined(ISC_SOCKET_DEBUG)
1463         dump_msg(&msghdr);
1464 #endif
1465
1466         if (cc < 0) {
1467                 if (SOFT_ERROR(recv_errno))
1468                         return (DOIO_SOFT);
1469
1470                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1471                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1472                         socket_log(sock, NULL, IOEVENT,
1473                                    isc_msgcat, ISC_MSGSET_SOCKET,
1474                                    ISC_MSG_DOIORECV,
1475                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1476                                    sock->fd, cc, recv_errno, strbuf);
1477                 }
1478
1479 #define SOFT_OR_HARD(_system, _isc) \
1480         if (recv_errno == _system) { \
1481                 if (sock->connected) { \
1482                         dev->result = _isc; \
1483                         inc_stats(sock->manager->stats, \
1484                                   sock->statsindex[STATID_RECVFAIL]); \
1485                         return (DOIO_HARD); \
1486                 } \
1487                 return (DOIO_SOFT); \
1488         }
1489 #define ALWAYS_HARD(_system, _isc) \
1490         if (recv_errno == _system) { \
1491                 dev->result = _isc; \
1492                 inc_stats(sock->manager->stats, \
1493                           sock->statsindex[STATID_RECVFAIL]); \
1494                 return (DOIO_HARD); \
1495         }
1496
1497                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1498                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1499                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1500                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1501                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1502                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1503                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1504                 /*
1505                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1506                  * errors.
1507                  */
1508 #ifdef EPROTO
1509                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1510 #endif
1511                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1512
1513 #undef SOFT_OR_HARD
1514 #undef ALWAYS_HARD
1515
1516                 dev->result = isc__errno2result(recv_errno);
1517                 inc_stats(sock->manager->stats,
1518                           sock->statsindex[STATID_RECVFAIL]);
1519                 return (DOIO_HARD);
1520         }
1521
1522         /*
1523          * On TCP, zero length reads indicate EOF, while on
1524          * UDP, zero length reads are perfectly valid, although
1525          * strange.
1526          */
1527         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1528                 return (DOIO_EOF);
1529
1530         if (sock->type == isc_sockettype_udp) {
1531                 dev->address.length = msghdr.msg_namelen;
1532                 if (isc_sockaddr_getport(&dev->address) == 0) {
1533                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1534                                 socket_log(sock, &dev->address, IOEVENT,
1535                                            isc_msgcat, ISC_MSGSET_SOCKET,
1536                                            ISC_MSG_ZEROPORT,
1537                                            "dropping source port zero packet");
1538                         }
1539                         return (DOIO_SOFT);
1540                 }
1541         }
1542
1543         socket_log(sock, &dev->address, IOEVENT,
1544                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1545                    "packet received correctly");
1546
1547         /*
1548          * Overflow bit detection.  If we received MORE bytes than we should,
1549          * this indicates an overflow situation.  Set the flag in the
1550          * dev entry and adjust how much we read by one.
1551          */
1552 #ifdef ISC_NET_RECVOVERFLOW
1553         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1554                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1555                 cc--;
1556         }
1557 #endif
1558
1559         /*
1560          * If there are control messages attached, run through them and pull
1561          * out the interesting bits.
1562          */
1563         if (sock->type == isc_sockettype_udp)
1564                 process_cmsg(sock, &msghdr, dev);
1565
1566         /*
1567          * update the buffers (if any) and the i/o count
1568          */
1569         dev->n += cc;
1570         actual_count = cc;
1571         buffer = ISC_LIST_HEAD(dev->bufferlist);
1572         while (buffer != NULL && actual_count > 0U) {
1573                 REQUIRE(ISC_BUFFER_VALID(buffer));
1574                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1575                         actual_count -= isc_buffer_availablelength(buffer);
1576                         isc_buffer_add(buffer,
1577                                        isc_buffer_availablelength(buffer));
1578                 } else {
1579                         isc_buffer_add(buffer, actual_count);
1580                         actual_count = 0;
1581                         break;
1582                 }
1583                 buffer = ISC_LIST_NEXT(buffer, link);
1584                 if (buffer == NULL) {
1585                         INSIST(actual_count == 0U);
1586                 }
1587         }
1588
1589         /*
1590          * If we read less than we expected, update counters,
1591          * and let the upper layer poke the descriptor.
1592          */
1593         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1594                 return (DOIO_SOFT);
1595
1596         /*
1597          * Full reads are posted, or partials if partials are ok.
1598          */
1599         dev->result = ISC_R_SUCCESS;
1600         return (DOIO_SUCCESS);
1601 }
1602
1603 /*
1604  * Returns:
1605  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1606  *                      ISC_R_SUCCESS.
1607  *
1608  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1609  *                      dev->result contains the appropriate error.
1610  *
1611  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1612  *                      event was sent.  The operation should be retried.
1613  *
1614  *      No other return values are possible.
1615  */
1616 static int
1617 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1618         int cc;
1619         struct iovec iov[MAXSCATTERGATHER_SEND];
1620         size_t write_count;
1621         struct msghdr msghdr;
1622         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1623         int attempts = 0;
1624         int send_errno;
1625         char strbuf[ISC_STRERRORSIZE];
1626
1627         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1628
1629  resend:
1630         cc = sendmsg(sock->fd, &msghdr, 0);
1631         send_errno = errno;
1632
1633         /*
1634          * Check for error or block condition.
1635          */
1636         if (cc < 0) {
1637                 if (send_errno == EINTR && ++attempts < NRETRIES)
1638                         goto resend;
1639
1640                 if (SOFT_ERROR(send_errno))
1641                         return (DOIO_SOFT);
1642
1643 #define SOFT_OR_HARD(_system, _isc) \
1644         if (send_errno == _system) { \
1645                 if (sock->connected) { \
1646                         dev->result = _isc; \
1647                         inc_stats(sock->manager->stats, \
1648                                   sock->statsindex[STATID_SENDFAIL]); \
1649                         return (DOIO_HARD); \
1650                 } \
1651                 return (DOIO_SOFT); \
1652         }
1653 #define ALWAYS_HARD(_system, _isc) \
1654         if (send_errno == _system) { \
1655                 dev->result = _isc; \
1656                 inc_stats(sock->manager->stats, \
1657                           sock->statsindex[STATID_SENDFAIL]); \
1658                 return (DOIO_HARD); \
1659         }
1660
1661                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1662                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1663                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1664                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1665                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1666 #ifdef EHOSTDOWN
1667                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1668 #endif
1669                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1670                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1671                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1672                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1673                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1674
1675 #undef SOFT_OR_HARD
1676 #undef ALWAYS_HARD
1677
1678                 /*
1679                  * The other error types depend on whether or not the
1680                  * socket is UDP or TCP.  If it is UDP, some errors
1681                  * that we expect to be fatal under TCP are merely
1682                  * annoying, and are really soft errors.
1683                  *
1684                  * However, these soft errors are still returned as
1685                  * a status.
1686                  */
1687                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1688                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1689                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1690                                  addrbuf, strbuf);
1691                 dev->result = isc__errno2result(send_errno);
1692                 inc_stats(sock->manager->stats,
1693                           sock->statsindex[STATID_SENDFAIL]);
1694                 return (DOIO_HARD);
1695         }
1696
1697         if (cc == 0) {
1698                 inc_stats(sock->manager->stats,
1699                           sock->statsindex[STATID_SENDFAIL]);
1700                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1701                                  "doio_send: send() %s 0",
1702                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1703                                                 ISC_MSG_RETURNED, "returned"));
1704         }
1705
1706         /*
1707          * If we write less than we expected, update counters, poke.
1708          */
1709         dev->n += cc;
1710         if ((size_t)cc != write_count)
1711                 return (DOIO_SOFT);
1712
1713         /*
1714          * Exactly what we wanted to write.  We're done with this
1715          * entry.  Post its completion event.
1716          */
1717         dev->result = ISC_R_SUCCESS;
1718         return (DOIO_SUCCESS);
1719 }
1720
1721 /*
1722  * Kill.
1723  *
1724  * Caller must ensure that the socket is not locked and no external
1725  * references exist.
1726  */
1727 static void
1728 closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1729         isc_sockettype_t type = sock->type;
1730         int lockid = FDLOCK_ID(fd);
1731
1732         /*
1733          * No one has this socket open, so the watcher doesn't have to be
1734          * poked, and the socket doesn't have to be locked.
1735          */
1736         LOCK(&manager->fdlock[lockid]);
1737         manager->fds[fd] = NULL;
1738         if (type == isc_sockettype_fdwatch)
1739                 manager->fdstate[fd] = CLOSED;
1740         else
1741                 manager->fdstate[fd] = CLOSE_PENDING;
1742         UNLOCK(&manager->fdlock[lockid]);
1743         if (type == isc_sockettype_fdwatch) {
1744                 /*
1745                  * The caller may close the socket once this function returns,
1746                  * and `fd' may be reassigned for a new socket.  So we do
1747                  * unwatch_fd() here, rather than defer it via select_poke().
1748                  * Note: this may complicate data protection among threads and
1749                  * may reduce performance due to additional locks.  One way to
1750                  * solve this would be to dup() the watched descriptor, but we
1751                  * take a simpler approach at this moment.
1752                  */
1753                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1754                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1755         } else
1756                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1757
1758         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1759
1760         /*
1761          * update manager->maxfd here (XXX: this should be implemented more
1762          * efficiently)
1763          */
1764 #ifdef USE_SELECT
1765         LOCK(&manager->lock);
1766         if (manager->maxfd == fd) {
1767                 int i;
1768
1769                 manager->maxfd = 0;
1770                 for (i = fd - 1; i >= 0; i--) {
1771                         lockid = FDLOCK_ID(i);
1772
1773                         LOCK(&manager->fdlock[lockid]);
1774                         if (manager->fdstate[i] == MANAGED) {
1775                                 manager->maxfd = i;
1776                                 UNLOCK(&manager->fdlock[lockid]);
1777                                 break;
1778                         }
1779                         UNLOCK(&manager->fdlock[lockid]);
1780                 }
1781 #ifdef ISC_PLATFORM_USETHREADS
1782                 if (manager->maxfd < manager->pipe_fds[0])
1783                         manager->maxfd = manager->pipe_fds[0];
1784 #endif
1785         }
1786         UNLOCK(&manager->lock);
1787 #endif  /* USE_SELECT */
1788 }
1789
1790 static void
1791 destroy(isc_socket_t **sockp) {
1792         int fd;
1793         isc_socket_t *sock = *sockp;
1794         isc_socketmgr_t *manager = sock->manager;
1795
1796         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1797                    ISC_MSG_DESTROYING, "destroying");
1798
1799         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1800         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1801         INSIST(ISC_LIST_EMPTY(sock->send_list));
1802         INSIST(sock->connect_ev == NULL);
1803         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1804
1805         if (sock->fd >= 0) {
1806                 fd = sock->fd;
1807                 sock->fd = -1;
1808                 closesocket(manager, sock, fd);
1809         }
1810
1811         LOCK(&manager->lock);
1812
1813         ISC_LIST_UNLINK(manager->socklist, sock, link);
1814
1815 #ifdef ISC_PLATFORM_USETHREADS
1816         if (ISC_LIST_EMPTY(manager->socklist))
1817                 SIGNAL(&manager->shutdown_ok);
1818 #endif /* ISC_PLATFORM_USETHREADS */
1819
1820         UNLOCK(&manager->lock);
1821
1822         free_socket(sockp);
1823 }
1824
1825 static isc_result_t
1826 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1827                 isc_socket_t **socketp)
1828 {
1829         isc_socket_t *sock;
1830         isc_result_t result;
1831         ISC_SOCKADDR_LEN_T cmsgbuflen;
1832
1833         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1834
1835         if (sock == NULL)
1836                 return (ISC_R_NOMEMORY);
1837
1838         result = ISC_R_UNEXPECTED;
1839
1840         sock->magic = 0;
1841         sock->references = 0;
1842
1843         sock->manager = manager;
1844         sock->type = type;
1845         sock->fd = -1;
1846         sock->statsindex = NULL;
1847
1848         ISC_LINK_INIT(sock, link);
1849
1850         sock->recvcmsgbuf = NULL;
1851         sock->sendcmsgbuf = NULL;
1852
1853         /*
1854          * set up cmsg buffers
1855          */
1856         cmsgbuflen = 0;
1857 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1858         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1859 #endif
1860 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1861         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1862 #endif
1863         sock->recvcmsgbuflen = cmsgbuflen;
1864         if (sock->recvcmsgbuflen != 0U) {
1865                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1866                 if (sock->recvcmsgbuf == NULL)
1867                         goto error;
1868         }
1869
1870         cmsgbuflen = 0;
1871 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1872         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1873 #endif
1874         sock->sendcmsgbuflen = cmsgbuflen;
1875         if (sock->sendcmsgbuflen != 0U) {
1876                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1877                 if (sock->sendcmsgbuf == NULL)
1878                         goto error;
1879         }
1880
1881         memset(sock->name, 0, sizeof(sock->name));
1882         sock->tag = NULL;
1883
1884         /*
1885          * set up list of readers and writers to be initially empty
1886          */
1887         ISC_LIST_INIT(sock->recv_list);
1888         ISC_LIST_INIT(sock->send_list);
1889         ISC_LIST_INIT(sock->accept_list);
1890         sock->connect_ev = NULL;
1891         sock->pending_recv = 0;
1892         sock->pending_send = 0;
1893         sock->pending_accept = 0;
1894         sock->listener = 0;
1895         sock->connected = 0;
1896         sock->connecting = 0;
1897         sock->bound = 0;
1898
1899         /*
1900          * initialize the lock
1901          */
1902         result = isc_mutex_init(&sock->lock);
1903         if (result != ISC_R_SUCCESS) {
1904                 sock->magic = 0;
1905                 goto error;
1906         }
1907
1908         /*
1909          * Initialize readable and writable events
1910          */
1911         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1912                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1913                        NULL, sock, sock, NULL, NULL);
1914         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1915                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1916                        NULL, sock, sock, NULL, NULL);
1917
1918         sock->magic = SOCKET_MAGIC;
1919         *socketp = sock;
1920
1921         return (ISC_R_SUCCESS);
1922
1923  error:
1924         if (sock->recvcmsgbuf != NULL)
1925                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1926                             sock->recvcmsgbuflen);
1927         if (sock->sendcmsgbuf != NULL)
1928                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1929                             sock->sendcmsgbuflen);
1930         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1931
1932         return (result);
1933 }
1934
1935 /*
1936  * This event requires that the various lists be empty, that the reference
1937  * count be 1, and that the magic number is valid.  The other socket bits,
1938  * like the lock, must be initialized as well.  The fd associated must be
1939  * marked as closed, by setting it to -1 on close, or this routine will
1940  * also close the socket.
1941  */
1942 static void
1943 free_socket(isc_socket_t **socketp) {
1944         isc_socket_t *sock = *socketp;
1945
1946         INSIST(sock->references == 0);
1947         INSIST(VALID_SOCKET(sock));
1948         INSIST(!sock->connecting);
1949         INSIST(!sock->pending_recv);
1950         INSIST(!sock->pending_send);
1951         INSIST(!sock->pending_accept);
1952         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1953         INSIST(ISC_LIST_EMPTY(sock->send_list));
1954         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1955         INSIST(!ISC_LINK_LINKED(sock, link));
1956
1957         if (sock->recvcmsgbuf != NULL)
1958                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1959                             sock->recvcmsgbuflen);
1960         if (sock->sendcmsgbuf != NULL)
1961                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1962                             sock->sendcmsgbuflen);
1963
1964         sock->magic = 0;
1965
1966         DESTROYLOCK(&sock->lock);
1967
1968         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1969
1970         *socketp = NULL;
1971 }
1972
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Run-once helper: on Linux, parse the "major.minor" prefix of the kernel
 * release reported by uname() and clear 'bsdcompat' when it is 2.4 or
 * newer.  On any other platform, or if the release cannot be obtained or
 * does not contain a '.', 'bsdcompat' is left at ISC_TRUE.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
	struct utsname buf;
	char *endp;
	long int major;
	long int minor;

	/*
	 * uname() is not expected to fail, but if it does 'buf' holds
	 * nothing usable, so do not parse it.
	 */
	if (uname(&buf) < 0)
		return;

	/* Paranoia in parsing can be increased, but we trust uname(). */
	major = strtol(buf.release, &endp, 10);
	if (*endp == '.') {
		minor = strtol(endp+1, &endp, 10);
		if ((major > 2) || ((major == 2) && (minor >= 4))) {
			bsdcompat = ISC_FALSE;
		}
	}
#endif /* __linux__ */
}
#endif /* SO_BSDCOMPAT */
2010
2011 static isc_result_t
2012 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
2013         char strbuf[ISC_STRERRORSIZE];
2014         const char *err = "socket";
2015         int tries = 0;
2016 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2017         int on = 1;
2018 #endif
2019 #if defined(SO_RCVBUF)
2020         ISC_SOCKADDR_LEN_T optlen;
2021         int size;
2022 #endif
2023
2024  again:
2025         switch (sock->type) {
2026         case isc_sockettype_udp:
2027                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2028                 break;
2029         case isc_sockettype_tcp:
2030                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2031                 break;
2032         case isc_sockettype_unix:
2033                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2034                 break;
2035         case isc_sockettype_fdwatch:
2036                 /*
2037                  * We should not be called for isc_sockettype_fdwatch sockets.
2038                  */
2039                 INSIST(0);
2040                 break;
2041         }
2042         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2043                 goto again;
2044
2045 #ifdef F_DUPFD
2046         /*
2047          * Leave a space for stdio and TCP to work in.
2048          */
2049         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2050             sock->fd >= 0 && sock->fd < manager->reserved) {
2051                 int new, tmp;
2052                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2053                 tmp = errno;
2054                 (void)close(sock->fd);
2055                 errno = tmp;
2056                 sock->fd = new;
2057                 err = "isc_socket_create: fcntl/reserved";
2058         } else if (sock->fd >= 0 && sock->fd < 20) {
2059                 int new, tmp;
2060                 new = fcntl(sock->fd, F_DUPFD, 20);
2061                 tmp = errno;
2062                 (void)close(sock->fd);
2063                 errno = tmp;
2064                 sock->fd = new;
2065                 err = "isc_socket_create: fcntl";
2066         }
2067 #endif
2068
2069         if (sock->fd >= (int)manager->maxsocks) {
2070                 (void)close(sock->fd);
2071                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2072                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2073                                isc_msgcat, ISC_MSGSET_SOCKET,
2074                                ISC_MSG_TOOMANYFDS,
2075                                "socket: file descriptor exceeds limit (%d/%u)",
2076                                sock->fd, manager->maxsocks);
2077                 return (ISC_R_NORESOURCES);
2078         }
2079
2080         if (sock->fd < 0) {
2081                 switch (errno) {
2082                 case EMFILE:
2083                 case ENFILE:
2084                         isc__strerror(errno, strbuf, sizeof(strbuf));
2085                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2086                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2087                                        isc_msgcat, ISC_MSGSET_SOCKET,
2088                                        ISC_MSG_TOOMANYFDS,
2089                                        "%s: %s", err, strbuf);
2090                         /* fallthrough */
2091                 case ENOBUFS:
2092                         return (ISC_R_NORESOURCES);
2093
2094                 case EPROTONOSUPPORT:
2095                 case EPFNOSUPPORT:
2096                 case EAFNOSUPPORT:
2097                 /*
2098                  * Linux 2.2 (and maybe others) return EINVAL instead of
2099                  * EAFNOSUPPORT.
2100                  */
2101                 case EINVAL:
2102                         return (ISC_R_FAMILYNOSUPPORT);
2103
2104                 default:
2105                         isc__strerror(errno, strbuf, sizeof(strbuf));
2106                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2107                                          "%s() %s: %s", err,
2108                                          isc_msgcat_get(isc_msgcat,
2109                                                         ISC_MSGSET_GENERAL,
2110                                                         ISC_MSG_FAILED,
2111                                                         "failed"),
2112                                          strbuf);
2113                         return (ISC_R_UNEXPECTED);
2114                 }
2115         }
2116
2117         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
2118                 (void)close(sock->fd);
2119                 return (ISC_R_UNEXPECTED);
2120         }
2121
2122 #ifdef SO_BSDCOMPAT
2123         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2124                                   clear_bsdcompat) == ISC_R_SUCCESS);
2125         if (sock->type != isc_sockettype_unix && bsdcompat &&
2126             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2127                        (void *)&on, sizeof(on)) < 0) {
2128                 isc__strerror(errno, strbuf, sizeof(strbuf));
2129                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2130                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2131                                  sock->fd,
2132                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2133                                                 ISC_MSG_FAILED, "failed"),
2134                                  strbuf);
2135                 /* Press on... */
2136         }
2137 #endif
2138
2139 #ifdef SO_NOSIGPIPE
2140         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2141                        (void *)&on, sizeof(on)) < 0) {
2142                 isc__strerror(errno, strbuf, sizeof(strbuf));
2143                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2144                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2145                                  sock->fd,
2146                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2147                                                 ISC_MSG_FAILED, "failed"),
2148                                  strbuf);
2149                 /* Press on... */
2150         }
2151 #endif
2152
2153 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2154         if (sock->type == isc_sockettype_udp) {
2155
2156 #if defined(USE_CMSG)
2157 #if defined(SO_TIMESTAMP)
2158                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2159                                (void *)&on, sizeof(on)) < 0
2160                     && errno != ENOPROTOOPT) {
2161                         isc__strerror(errno, strbuf, sizeof(strbuf));
2162                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2163                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2164                                          sock->fd,
2165                                          isc_msgcat_get(isc_msgcat,
2166                                                         ISC_MSGSET_GENERAL,
2167                                                         ISC_MSG_FAILED,
2168                                                         "failed"),
2169                                          strbuf);
2170                         /* Press on... */
2171                 }
2172 #endif /* SO_TIMESTAMP */
2173
2174 #if defined(ISC_PLATFORM_HAVEIPV6)
2175                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2176                         /*
2177                          * Warn explicitly because this anomaly can be hidden
2178                          * in usual operation (and unexpectedly appear later).
2179                          */
2180                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2181                                          "No buffer available to receive "
2182                                          "IPv6 destination");
2183                 }
2184 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2185 #ifdef IPV6_RECVPKTINFO
2186                 /* RFC 3542 */
2187                 if ((sock->pf == AF_INET6)
2188                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2189                                    (void *)&on, sizeof(on)) < 0)) {
2190                         isc__strerror(errno, strbuf, sizeof(strbuf));
2191                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2192                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2193                                          "%s: %s", sock->fd,
2194                                          isc_msgcat_get(isc_msgcat,
2195                                                         ISC_MSGSET_GENERAL,
2196                                                         ISC_MSG_FAILED,
2197                                                         "failed"),
2198                                          strbuf);
2199                 }
2200 #else
2201                 /* RFC 2292 */
2202                 if ((sock->pf == AF_INET6)
2203                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2204                                    (void *)&on, sizeof(on)) < 0)) {
2205                         isc__strerror(errno, strbuf, sizeof(strbuf));
2206                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2207                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2208                                          sock->fd,
2209                                          isc_msgcat_get(isc_msgcat,
2210                                                         ISC_MSGSET_GENERAL,
2211                                                         ISC_MSG_FAILED,
2212                                                         "failed"),
2213                                          strbuf);
2214                 }
2215 #endif /* IPV6_RECVPKTINFO */
2216 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2217 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2218                 /* use minimum MTU */
2219                 if (sock->pf == AF_INET6) {
2220                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2221                                          IPV6_USE_MIN_MTU,
2222                                          (void *)&on, sizeof(on));
2223                 }
2224 #endif
2225 #endif /* ISC_PLATFORM_HAVEIPV6 */
2226 #endif /* defined(USE_CMSG) */
2227
2228 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2229                 /*
2230                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2231                  */
2232                 if (sock->pf == AF_INET) {
2233                         int action = IP_PMTUDISC_DONT;
2234                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2235                                          &action, sizeof(action));
2236                 }
2237 #endif
2238 #if defined(IP_DONTFRAG)
2239                 /*
2240                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2241                  */
2242                 if (sock->pf == AF_INET) {
2243                         int off = 0;
2244                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2245                                          &off, sizeof(off));
2246                 }
2247 #endif
2248
2249 #if defined(SO_RCVBUF)
2250                 optlen = sizeof(size);
2251                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2252                                (void *)&size, &optlen) >= 0 &&
2253                      size < RCVBUFSIZE) {
2254                         size = RCVBUFSIZE;
2255                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2256                                        (void *)&size, sizeof(size)) == -1) {
2257                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2258                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2259                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2260                                         sock->fd, size,
2261                                         isc_msgcat_get(isc_msgcat,
2262                                                        ISC_MSGSET_GENERAL,
2263                                                        ISC_MSG_FAILED,
2264                                                        "failed"),
2265                                         strbuf);
2266                         }
2267                 }
2268 #endif
2269         }
2270 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2271
2272         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2273
2274         return (ISC_R_SUCCESS);
2275 }
2276
2277 /*%
2278  * Create a new 'type' socket managed by 'manager'.  Events
2279  * will be posted to 'task' and when dispatched 'action' will be
2280  * called with 'arg' as the arg value.  The new socket is returned
2281  * in 'socketp'.
2282  */
2283 isc_result_t
2284 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2285                   isc_socket_t **socketp)
2286 {
2287         isc_socket_t *sock = NULL;
2288         isc_result_t result;
2289         int lockid;
2290
2291         REQUIRE(VALID_MANAGER(manager));
2292         REQUIRE(socketp != NULL && *socketp == NULL);
2293         REQUIRE(type != isc_sockettype_fdwatch);
2294
2295         result = allocate_socket(manager, type, &sock);
2296         if (result != ISC_R_SUCCESS)
2297                 return (result);
2298
2299         switch (sock->type) {
2300         case isc_sockettype_udp:
2301                 sock->statsindex =
2302                         (pf == AF_INET) ? upd4statsindex : upd6statsindex;
2303                 break;
2304         case isc_sockettype_tcp:
2305                 sock->statsindex =
2306                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2307                 break;
2308         case isc_sockettype_unix:
2309                 sock->statsindex = unixstatsindex;
2310                 break;
2311         default:
2312                 INSIST(0);
2313         }
2314
2315         sock->pf = pf;
2316         result = opensocket(manager, sock);
2317         if (result != ISC_R_SUCCESS) {
2318                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2319                 free_socket(&sock);
2320                 return (result);
2321         }
2322
2323         sock->references = 1;
2324         *socketp = sock;
2325
2326         /*
2327          * Note we don't have to lock the socket like we normally would because
2328          * there are no external references to it yet.
2329          */
2330
2331         lockid = FDLOCK_ID(sock->fd);
2332         LOCK(&manager->fdlock[lockid]);
2333         manager->fds[sock->fd] = sock;
2334         manager->fdstate[sock->fd] = MANAGED;
2335 #ifdef USE_DEVPOLL
2336         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2337                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2338 #endif
2339         UNLOCK(&manager->fdlock[lockid]);
2340
2341         LOCK(&manager->lock);
2342         ISC_LIST_APPEND(manager->socklist, sock, link);
2343 #ifdef USE_SELECT
2344         if (manager->maxfd < sock->fd)
2345                 manager->maxfd = sock->fd;
2346 #endif
2347         UNLOCK(&manager->lock);
2348
2349         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2350                    ISC_MSG_CREATED, "created");
2351
2352         return (ISC_R_SUCCESS);
2353 }
2354
2355 isc_result_t
2356 isc_socket_open(isc_socket_t *sock) {
2357         isc_result_t result;
2358
2359         REQUIRE(VALID_SOCKET(sock));
2360
2361         LOCK(&sock->lock);
2362         REQUIRE(sock->references == 1);
2363         REQUIRE(sock->type != isc_sockettype_fdwatch);
2364         UNLOCK(&sock->lock);
2365         /*
2366          * We don't need to retain the lock hereafter, since no one else has
2367          * this socket.
2368          */
2369         REQUIRE(sock->fd == -1);
2370
2371         result = opensocket(sock->manager, sock);
2372         if (result != ISC_R_SUCCESS)
2373                 sock->fd = -1;
2374
2375         if (result == ISC_R_SUCCESS) {
2376                 int lockid = FDLOCK_ID(sock->fd);
2377
2378                 LOCK(&sock->manager->fdlock[lockid]);
2379                 sock->manager->fds[sock->fd] = sock;
2380                 sock->manager->fdstate[sock->fd] = MANAGED;
2381 #ifdef USE_DEVPOLL
2382                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2383                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2384 #endif
2385                 UNLOCK(&sock->manager->fdlock[lockid]);
2386
2387 #ifdef USE_SELECT
2388                 LOCK(&sock->manager->lock);
2389                 if (sock->manager->maxfd < sock->fd)
2390                         sock->manager->maxfd = sock->fd;
2391                 UNLOCK(&sock->manager->lock);
2392 #endif
2393         }
2394
2395         return (result);
2396 }
2397
2398 /*
2399  * Create a new 'type' socket managed by 'manager'.  Events
2400  * will be posted to 'task' and when dispatched 'action' will be
2401  * called with 'arg' as the arg value.  The new socket is returned
2402  * in 'socketp'.
2403  */
2404 isc_result_t
2405 isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
2406                          isc_sockfdwatch_t callback, void *cbarg,
2407                          isc_task_t *task, isc_socket_t **socketp)
2408 {
2409         isc_socket_t *sock = NULL;
2410         isc_result_t result;
2411         int lockid;
2412
2413         REQUIRE(VALID_MANAGER(manager));
2414         REQUIRE(socketp != NULL && *socketp == NULL);
2415
2416         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2417         if (result != ISC_R_SUCCESS)
2418                 return (result);
2419
2420         sock->fd = fd;
2421         sock->fdwatcharg = cbarg;
2422         sock->fdwatchcb = callback;
2423         sock->fdwatchflags = flags;
2424         sock->fdwatchtask = task;
2425         sock->statsindex = fdwatchstatsindex;
2426
2427         sock->references = 1;
2428         *socketp = sock;
2429
2430         /*
2431          * Note we don't have to lock the socket like we normally would because
2432          * there are no external references to it yet.
2433          */
2434
2435         lockid = FDLOCK_ID(sock->fd);
2436         LOCK(&manager->fdlock[lockid]);
2437         manager->fds[sock->fd] = sock;
2438         manager->fdstate[sock->fd] = MANAGED;
2439         UNLOCK(&manager->fdlock[lockid]);
2440
2441         LOCK(&manager->lock);
2442         ISC_LIST_APPEND(manager->socklist, sock, link);
2443 #ifdef USE_SELECT
2444         if (manager->maxfd < sock->fd)
2445                 manager->maxfd = sock->fd;
2446 #endif
2447         UNLOCK(&manager->lock);
2448
2449         if (flags & ISC_SOCKFDWATCH_READ)
2450                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2451         if (flags & ISC_SOCKFDWATCH_WRITE)
2452                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2453
2454         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2455                    ISC_MSG_CREATED, "fdwatch-created");
2456
2457         return (ISC_R_SUCCESS);
2458 }
2459
2460 /*
2461  * Attach to a socket.  Caller must explicitly detach when it is done.
2462  */
2463 void
2464 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2465         REQUIRE(VALID_SOCKET(sock));
2466         REQUIRE(socketp != NULL && *socketp == NULL);
2467
2468         LOCK(&sock->lock);
2469         sock->references++;
2470         UNLOCK(&sock->lock);
2471
2472         *socketp = sock;
2473 }
2474
2475 /*
2476  * Dereference a socket.  If this is the last reference to it, clean things
2477  * up by destroying the socket.
2478  */
2479 void
2480 isc_socket_detach(isc_socket_t **socketp) {
2481         isc_socket_t *sock;
2482         isc_boolean_t kill_socket = ISC_FALSE;
2483
2484         REQUIRE(socketp != NULL);
2485         sock = *socketp;
2486         REQUIRE(VALID_SOCKET(sock));
2487
2488         LOCK(&sock->lock);
2489         REQUIRE(sock->references > 0);
2490         sock->references--;
2491         if (sock->references == 0)
2492                 kill_socket = ISC_TRUE;
2493         UNLOCK(&sock->lock);
2494
2495         if (kill_socket)
2496                 destroy(&sock);
2497
2498         *socketp = NULL;
2499 }
2500
2501 isc_result_t
2502 isc_socket_close(isc_socket_t *sock) {
2503         int fd;
2504         isc_socketmgr_t *manager;
2505         isc_sockettype_t type;
2506
2507         REQUIRE(VALID_SOCKET(sock));
2508
2509         LOCK(&sock->lock);
2510
2511         REQUIRE(sock->references == 1);
2512         REQUIRE(sock->type != isc_sockettype_fdwatch);
2513         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2514
2515         INSIST(!sock->connecting);
2516         INSIST(!sock->pending_recv);
2517         INSIST(!sock->pending_send);
2518         INSIST(!sock->pending_accept);
2519         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2520         INSIST(ISC_LIST_EMPTY(sock->send_list));
2521         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2522         INSIST(sock->connect_ev == NULL);
2523
2524         manager = sock->manager;
2525         type = sock->type;
2526         fd = sock->fd;
2527         sock->fd = -1;
2528         memset(sock->name, 0, sizeof(sock->name));
2529         sock->tag = NULL;
2530         sock->listener = 0;
2531         sock->connected = 0;
2532         sock->connecting = 0;
2533         sock->bound = 0;
2534         isc_sockaddr_any(&sock->peer_address);
2535
2536         UNLOCK(&sock->lock);
2537
2538         closesocket(manager, sock, fd);
2539
2540         return (ISC_R_SUCCESS);
2541 }
2542
2543 /*
2544  * I/O is possible on a given socket.  Schedule an event to this task that
2545  * will call an internal function to do the I/O.  This will charge the
2546  * task with the I/O operation and let our select loop handler get back
2547  * to doing something real as fast as possible.
2548  *
2549  * The socket and manager must be locked before calling this function.
2550  */
2551 static void
2552 dispatch_recv(isc_socket_t *sock) {
2553         intev_t *iev;
2554         isc_socketevent_t *ev;
2555         isc_task_t *sender;
2556
2557         INSIST(!sock->pending_recv);
2558
2559         if (sock->type != isc_sockettype_fdwatch) {
2560                 ev = ISC_LIST_HEAD(sock->recv_list);
2561                 if (ev == NULL)
2562                         return;
2563                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2564                            "dispatch_recv:  event %p -> task %p",
2565                            ev, ev->ev_sender);
2566                 sender = ev->ev_sender;
2567         } else {
2568                 sender = sock->fdwatchtask;
2569         }
2570
2571         sock->pending_recv = 1;
2572         iev = &sock->readable_ev;
2573
2574         sock->references++;
2575         iev->ev_sender = sock;
2576         if (sock->type == isc_sockettype_fdwatch)
2577                 iev->ev_action = internal_fdwatch_read;
2578         else
2579                 iev->ev_action = internal_recv;
2580         iev->ev_arg = sock;
2581
2582         isc_task_send(sender, (isc_event_t **)&iev);
2583 }
2584
2585 static void
2586 dispatch_send(isc_socket_t *sock) {
2587         intev_t *iev;
2588         isc_socketevent_t *ev;
2589         isc_task_t *sender;
2590
2591         INSIST(!sock->pending_send);
2592
2593         if (sock->type != isc_sockettype_fdwatch) {
2594                 ev = ISC_LIST_HEAD(sock->send_list);
2595                 if (ev == NULL)
2596                         return;
2597                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2598                            "dispatch_send:  event %p -> task %p",
2599                            ev, ev->ev_sender);
2600                 sender = ev->ev_sender;
2601         } else {
2602                 sender = sock->fdwatchtask;
2603         }
2604
2605         sock->pending_send = 1;
2606         iev = &sock->writable_ev;
2607
2608         sock->references++;
2609         iev->ev_sender = sock;
2610         if (sock->type == isc_sockettype_fdwatch)
2611                 iev->ev_action = internal_fdwatch_write;
2612         else
2613                 iev->ev_action = internal_send;
2614         iev->ev_arg = sock;
2615
2616         isc_task_send(sender, (isc_event_t **)&iev);
2617 }
2618
2619 /*
2620  * Dispatch an internal accept event.
2621  */
2622 static void
2623 dispatch_accept(isc_socket_t *sock) {
2624         intev_t *iev;
2625         isc_socket_newconnev_t *ev;
2626
2627         INSIST(!sock->pending_accept);
2628
2629         /*
2630          * Are there any done events left, or were they all canceled
2631          * before the manager got the socket lock?
2632          */
2633         ev = ISC_LIST_HEAD(sock->accept_list);
2634         if (ev == NULL)
2635                 return;
2636
2637         sock->pending_accept = 1;
2638         iev = &sock->readable_ev;
2639
2640         sock->references++;  /* keep socket around for this internal event */
2641         iev->ev_sender = sock;
2642         iev->ev_action = internal_accept;
2643         iev->ev_arg = sock;
2644
2645         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2646 }
2647
2648 static void
2649 dispatch_connect(isc_socket_t *sock) {
2650         intev_t *iev;
2651         isc_socket_connev_t *ev;
2652
2653         iev = &sock->writable_ev;
2654
2655         ev = sock->connect_ev;
2656         INSIST(ev != NULL); /* XXX */
2657
2658         INSIST(sock->connecting);
2659
2660         sock->references++;  /* keep socket around for this internal event */
2661         iev->ev_sender = sock;
2662         iev->ev_action = internal_connect;
2663         iev->ev_arg = sock;
2664
2665         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2666 }
2667
2668 /*
2669  * Dequeue an item off the given socket's read queue, set the result code
2670  * in the done event to the one provided, and send it to the task it was
2671  * destined for.
2672  *
2673  * If the event to be sent is on a list, remove it before sending.  If
2674  * asked to, send and detach from the socket as well.
2675  *
2676  * Caller must have the socket locked if the event is attached to the socket.
2677  */
2678 static void
2679 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2680         isc_task_t *task;
2681
2682         task = (*dev)->ev_sender;
2683
2684         (*dev)->ev_sender = sock;
2685
2686         if (ISC_LINK_LINKED(*dev, ev_link))
2687                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2688
2689         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2690             == ISC_SOCKEVENTATTR_ATTACHED)
2691                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2692         else
2693                 isc_task_send(task, (isc_event_t **)dev);
2694 }
2695
2696 /*
2697  * See comments for send_recvdone_event() above.
2698  *
2699  * Caller must have the socket locked if the event is attached to the socket.
2700  */
2701 static void
2702 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2703         isc_task_t *task;
2704
2705         INSIST(dev != NULL && *dev != NULL);
2706
2707         task = (*dev)->ev_sender;
2708         (*dev)->ev_sender = sock;
2709
2710         if (ISC_LINK_LINKED(*dev, ev_link))
2711                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2712
2713         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2714             == ISC_SOCKEVENTATTR_ATTACHED)
2715                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2716         else
2717                 isc_task_send(task, (isc_event_t **)dev);
2718 }
2719
2720 /*
2721  * Call accept() on a socket, to get the new file descriptor.  The listen
2722  * socket is used as a prototype to create a new isc_socket_t.  The new
2723  * socket has one outstanding reference.  The task receiving the event
2724  * will be detached from just after the event is delivered.
2725  *
2726  * On entry to this function, the event delivered is the internal
2727  * readable event, and the first item on the accept_list should be
2728  * the done event we want to send.  If the list is empty, this is a no-op,
2729  * so just unlock and return.
2730  */
2731 static void
2732 internal_accept(isc_task_t *me, isc_event_t *ev) {
2733         isc_socket_t *sock;
2734         isc_socketmgr_t *manager;
2735         isc_socket_newconnev_t *dev;
2736         isc_task_t *task;
2737         ISC_SOCKADDR_LEN_T addrlen;
2738         int fd;
2739         isc_result_t result = ISC_R_SUCCESS;
2740         char strbuf[ISC_STRERRORSIZE];
2741         const char *err = "accept";
2742
2743         UNUSED(me);
2744
2745         sock = ev->ev_sender;
2746         INSIST(VALID_SOCKET(sock));
2747
2748         LOCK(&sock->lock);
2749         socket_log(sock, NULL, TRACE,
2750                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2751                    "internal_accept called, locked socket");
2752
2753         manager = sock->manager;
2754         INSIST(VALID_MANAGER(manager));
2755
2756         INSIST(sock->listener);
2757         INSIST(sock->pending_accept == 1);
2758         sock->pending_accept = 0;
2759
2760         INSIST(sock->references > 0);
2761         sock->references--;  /* the internal event is done with this socket */
2762         if (sock->references == 0) {
2763                 UNLOCK(&sock->lock);
2764                 destroy(&sock);
2765                 return;
2766         }
2767
2768         /*
2769          * Get the first item off the accept list.
2770          * If it is empty, unlock the socket and return.
2771          */
2772         dev = ISC_LIST_HEAD(sock->accept_list);
2773         if (dev == NULL) {
2774                 UNLOCK(&sock->lock);
2775                 return;
2776         }
2777
2778         /*
2779          * Try to accept the new connection.  If the accept fails with
2780          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2781          * again.  Also ignore ECONNRESET, which has been reported to
2782          * be spuriously returned on Linux 2.2.19 although it is not
2783          * a documented error for accept().  ECONNABORTED has been
2784          * reported for Solaris 8.  The rest are thrown in not because
2785          * we have seen them but because they are ignored by other
2786          * daemons such as BIND 8 and Apache.
2787          */
2788
2789         addrlen = sizeof(dev->newsocket->peer_address.type);
2790         memset(&dev->newsocket->peer_address.type, 0, addrlen);
2791         fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa,
2792                     (void *)&addrlen);
2793
2794 #ifdef F_DUPFD
2795         /*
2796          * Leave a space for stdio to work in.
2797          */
2798         if (fd >= 0 && fd < 20) {
2799                 int new, tmp;
2800                 new = fcntl(fd, F_DUPFD, 20);
2801                 tmp = errno;
2802                 (void)close(fd);
2803                 errno = tmp;
2804                 fd = new;
2805                 err = "accept/fcntl";
2806         }
2807 #endif
2808
2809         if (fd < 0) {
2810                 if (SOFT_ERROR(errno))
2811                         goto soft_error;
2812                 switch (errno) {
2813                 case ENFILE:
2814                 case EMFILE:
2815                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2816                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2817                                        isc_msgcat, ISC_MSGSET_SOCKET,
2818                                        ISC_MSG_TOOMANYFDS,
2819                                        "%s: too many open file descriptors",
2820                                        err);
2821                         goto soft_error;
2822
2823                 case ENOBUFS:
2824                 case ENOMEM:
2825                 case ECONNRESET:
2826                 case ECONNABORTED:
2827                 case EHOSTUNREACH:
2828                 case EHOSTDOWN:
2829                 case ENETUNREACH:
2830                 case ENETDOWN:
2831                 case ECONNREFUSED:
2832 #ifdef EPROTO
2833                 case EPROTO:
2834 #endif
2835 #ifdef ENONET
2836                 case ENONET:
2837 #endif
2838                         goto soft_error;
2839                 default:
2840                         break;
2841                 }
2842                 isc__strerror(errno, strbuf, sizeof(strbuf));
2843                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2844                                  "internal_accept: %s() %s: %s", err,
2845                                  isc_msgcat_get(isc_msgcat,
2846                                                 ISC_MSGSET_GENERAL,
2847                                                 ISC_MSG_FAILED,
2848                                                 "failed"),
2849                                  strbuf);
2850                 fd = -1;
2851                 result = ISC_R_UNEXPECTED;
2852         } else {
2853                 if (addrlen == 0U) {
2854                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2855                                          "internal_accept(): "
2856                                          "accept() failed to return "
2857                                          "remote address");
2858
2859                         (void)close(fd);
2860                         goto soft_error;
2861                 } else if (dev->newsocket->peer_address.type.sa.sa_family !=
2862                            sock->pf)
2863                 {
2864                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2865                                          "internal_accept(): "
2866                                          "accept() returned peer address "
2867                                          "family %u (expected %u)",
2868                                          dev->newsocket->peer_address.
2869                                          type.sa.sa_family,
2870                                          sock->pf);
2871                         (void)close(fd);
2872                         goto soft_error;
2873                 } else if (fd >= (int)manager->maxsocks) {
2874                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2875                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2876                                        isc_msgcat, ISC_MSGSET_SOCKET,
2877                                        ISC_MSG_TOOMANYFDS,
2878                                        "accept: "
2879                                        "file descriptor exceeds limit (%d/%u)",
2880                                        fd, manager->maxsocks);
2881                         (void)close(fd);
2882                         goto soft_error;
2883                 }
2884         }
2885
2886         if (fd != -1) {
2887                 dev->newsocket->peer_address.length = addrlen;
2888                 dev->newsocket->pf = sock->pf;
2889         }
2890
2891         /*
2892          * Pull off the done event.
2893          */
2894         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2895
2896         /*
2897          * Poke watcher if there are more pending accepts.
2898          */
2899         if (!ISC_LIST_EMPTY(sock->accept_list))
2900                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2901
2902         UNLOCK(&sock->lock);
2903
2904         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2905                 (void)close(fd);
2906                 fd = -1;
2907                 result = ISC_R_UNEXPECTED;
2908         }
2909
2910         /*
2911          * -1 means the new socket didn't happen.
2912          */
2913         if (fd != -1) {
2914                 int lockid = FDLOCK_ID(fd);
2915
2916                 LOCK(&manager->fdlock[lockid]);
2917                 manager->fds[fd] = dev->newsocket;
2918                 manager->fdstate[fd] = MANAGED;
2919                 UNLOCK(&manager->fdlock[lockid]);
2920
2921                 LOCK(&manager->lock);
2922                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2923
2924                 dev->newsocket->fd = fd;
2925                 dev->newsocket->bound = 1;
2926                 dev->newsocket->connected = 1;
2927
2928                 /*
2929                  * Save away the remote address
2930                  */
2931                 dev->address = dev->newsocket->peer_address;
2932
2933 #ifdef USE_SELECT
2934                 if (manager->maxfd < fd)
2935                         manager->maxfd = fd;
2936 #endif
2937
2938                 socket_log(sock, &dev->newsocket->peer_address, CREATION,
2939                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2940                            "accepted connection, new socket %p",
2941                            dev->newsocket);
2942
2943                 UNLOCK(&manager->lock);
2944
2945                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
2946         } else {
2947                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2948                 dev->newsocket->references--;
2949                 free_socket(&dev->newsocket);
2950         }
2951
2952         /*
2953          * Fill in the done event details and send it off.
2954          */
2955         dev->result = result;
2956         task = dev->ev_sender;
2957         dev->ev_sender = sock;
2958
2959         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2960         return;
2961
2962  soft_error:
2963         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2964         UNLOCK(&sock->lock);
2965
2966         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2967         return;
2968 }
2969
2970 static void
2971 internal_recv(isc_task_t *me, isc_event_t *ev) {
2972         isc_socketevent_t *dev;
2973         isc_socket_t *sock;
2974
2975         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2976
2977         sock = ev->ev_sender;
2978         INSIST(VALID_SOCKET(sock));
2979
2980         LOCK(&sock->lock);
2981         socket_log(sock, NULL, IOEVENT,
2982                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2983                    "internal_recv: task %p got event %p", me, ev);
2984
2985         INSIST(sock->pending_recv == 1);
2986         sock->pending_recv = 0;
2987
2988         INSIST(sock->references > 0);
2989         sock->references--;  /* the internal event is done with this socket */
2990         if (sock->references == 0) {
2991                 UNLOCK(&sock->lock);
2992                 destroy(&sock);
2993                 return;
2994         }
2995
2996         /*
2997          * Try to do as much I/O as possible on this socket.  There are no
2998          * limits here, currently.
2999          */
3000         dev = ISC_LIST_HEAD(sock->recv_list);
3001         while (dev != NULL) {
3002                 switch (doio_recv(sock, dev)) {
3003                 case DOIO_SOFT:
3004                         goto poke;
3005
3006                 case DOIO_EOF:
3007                         /*
3008                          * read of 0 means the remote end was closed.
3009                          * Run through the event queue and dispatch all
3010                          * the events with an EOF result code.
3011                          */
3012                         do {
3013                                 dev->result = ISC_R_EOF;
3014                                 send_recvdone_event(sock, &dev);
3015                                 dev = ISC_LIST_HEAD(sock->recv_list);
3016                         } while (dev != NULL);
3017                         goto poke;
3018
3019                 case DOIO_SUCCESS:
3020                 case DOIO_HARD:
3021                         send_recvdone_event(sock, &dev);
3022                         break;
3023                 }
3024
3025                 dev = ISC_LIST_HEAD(sock->recv_list);
3026         }
3027
3028  poke:
3029         if (!ISC_LIST_EMPTY(sock->recv_list))
3030                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3031
3032         UNLOCK(&sock->lock);
3033 }
3034
3035 static void
3036 internal_send(isc_task_t *me, isc_event_t *ev) {
3037         isc_socketevent_t *dev;
3038         isc_socket_t *sock;
3039
3040         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3041
3042         /*
3043          * Find out what socket this is and lock it.
3044          */
3045         sock = (isc_socket_t *)ev->ev_sender;
3046         INSIST(VALID_SOCKET(sock));
3047
3048         LOCK(&sock->lock);
3049         socket_log(sock, NULL, IOEVENT,
3050                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3051                    "internal_send: task %p got event %p", me, ev);
3052
3053         INSIST(sock->pending_send == 1);
3054         sock->pending_send = 0;
3055
3056         INSIST(sock->references > 0);
3057         sock->references--;  /* the internal event is done with this socket */
3058         if (sock->references == 0) {
3059                 UNLOCK(&sock->lock);
3060                 destroy(&sock);
3061                 return;
3062         }
3063
3064         /*
3065          * Try to do as much I/O as possible on this socket.  There are no
3066          * limits here, currently.
3067          */
3068         dev = ISC_LIST_HEAD(sock->send_list);
3069         while (dev != NULL) {
3070                 switch (doio_send(sock, dev)) {
3071                 case DOIO_SOFT:
3072                         goto poke;
3073
3074                 case DOIO_HARD:
3075                 case DOIO_SUCCESS:
3076                         send_senddone_event(sock, &dev);
3077                         break;
3078                 }
3079
3080                 dev = ISC_LIST_HEAD(sock->send_list);
3081         }
3082
3083  poke:
3084         if (!ISC_LIST_EMPTY(sock->send_list))
3085                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3086
3087         UNLOCK(&sock->lock);
3088 }
3089
3090 static void
3091 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3092         isc_socket_t *sock;
3093         int more_data;
3094
3095         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3096
3097         /*
3098          * Find out what socket this is and lock it.
3099          */
3100         sock = (isc_socket_t *)ev->ev_sender;
3101         INSIST(VALID_SOCKET(sock));
3102
3103         LOCK(&sock->lock);
3104         socket_log(sock, NULL, IOEVENT,
3105                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3106                    "internal_fdwatch_write: task %p got event %p", me, ev);
3107
3108         INSIST(sock->pending_send == 1);
3109
3110         UNLOCK(&sock->lock);
3111         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3112         LOCK(&sock->lock);
3113
3114         sock->pending_send = 0;
3115
3116         INSIST(sock->references > 0);
3117         sock->references--;  /* the internal event is done with this socket */
3118         if (sock->references == 0) {
3119                 UNLOCK(&sock->lock);
3120                 destroy(&sock);
3121                 return;
3122         }
3123
3124         if (more_data)
3125                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3126
3127         UNLOCK(&sock->lock);
3128 }
3129
3130 static void
3131 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3132         isc_socket_t *sock;
3133         int more_data;
3134
3135         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3136
3137         /*
3138          * Find out what socket this is and lock it.
3139          */
3140         sock = (isc_socket_t *)ev->ev_sender;
3141         INSIST(VALID_SOCKET(sock));
3142
3143         LOCK(&sock->lock);
3144         socket_log(sock, NULL, IOEVENT,
3145                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3146                    "internal_fdwatch_read: task %p got event %p", me, ev);
3147
3148         INSIST(sock->pending_recv == 1);
3149
3150         UNLOCK(&sock->lock);
3151         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3152         LOCK(&sock->lock);
3153
3154         sock->pending_recv = 0;
3155
3156         INSIST(sock->references > 0);
3157         sock->references--;  /* the internal event is done with this socket */
3158         if (sock->references == 0) {
3159                 UNLOCK(&sock->lock);
3160                 destroy(&sock);
3161                 return;
3162         }
3163
3164         if (more_data)
3165                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3166
3167         UNLOCK(&sock->lock);
3168 }
3169
3170 /*
3171  * Process read/writes on each fd here.  Avoid locking
3172  * and unlocking twice if both reads and writes are possible.
3173  */
3174 static void
3175 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
3176            isc_boolean_t writeable)
3177 {
3178         isc_socket_t *sock;
3179         isc_boolean_t unlock_sock;
3180         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3181         int lockid = FDLOCK_ID(fd);
3182
3183         /*
3184          * If the socket is going to be closed, don't do more I/O.
3185          */
3186         LOCK(&manager->fdlock[lockid]);
3187         if (manager->fdstate[fd] == CLOSE_PENDING) {
3188                 UNLOCK(&manager->fdlock[lockid]);
3189
3190                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3191                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3192                 return;
3193         }
3194
3195         sock = manager->fds[fd];
3196         unlock_sock = ISC_FALSE;
3197         if (readable) {
3198                 if (sock == NULL) {
3199                         unwatch_read = ISC_TRUE;
3200                         goto check_write;
3201                 }
3202                 unlock_sock = ISC_TRUE;
3203                 LOCK(&sock->lock);
3204                 if (!SOCK_DEAD(sock)) {
3205                         if (sock->listener)
3206                                 dispatch_accept(sock);
3207                         else
3208                                 dispatch_recv(sock);
3209                 }
3210                 unwatch_read = ISC_TRUE;
3211         }
3212 check_write:
3213         if (writeable) {
3214                 if (sock == NULL) {
3215                         unwatch_write = ISC_TRUE;
3216                         goto unlock_fd;
3217                 }
3218                 if (!unlock_sock) {
3219                         unlock_sock = ISC_TRUE;
3220                         LOCK(&sock->lock);
3221                 }
3222                 if (!SOCK_DEAD(sock)) {
3223                         if (sock->connecting)
3224                                 dispatch_connect(sock);
3225                         else
3226                                 dispatch_send(sock);
3227                 }
3228                 unwatch_write = ISC_TRUE;
3229         }
3230         if (unlock_sock)
3231                 UNLOCK(&sock->lock);
3232
3233  unlock_fd:
3234         UNLOCK(&manager->fdlock[lockid]);
3235         if (unwatch_read)
3236                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3237         if (unwatch_write)
3238                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3239
3240 }
3241
3242 #ifdef USE_KQUEUE
3243 static isc_boolean_t
3244 process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
3245         int i;
3246         isc_boolean_t readable, writable;
3247         isc_boolean_t done = ISC_FALSE;
3248 #ifdef ISC_PLATFORM_USETHREADS
3249         isc_boolean_t have_ctlevent = ISC_FALSE;
3250 #endif
3251
3252         if (nevents == manager->nevents) {
3253                 /*
3254                  * This is not an error, but something unexpected.  If this
3255                  * happens, it may indicate the need for increasing
3256                  * ISC_SOCKET_MAXEVENTS.
3257                  */
3258                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3259                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3260                             "maximum number of FD events (%d) received",
3261                             nevents);
3262         }
3263
3264         for (i = 0; i < nevents; i++) {
3265                 REQUIRE(events[i].ident < manager->maxsocks);
3266 #ifdef ISC_PLATFORM_USETHREADS
3267                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3268                         have_ctlevent = ISC_TRUE;
3269                         continue;
3270                 }
3271 #endif
3272                 readable = ISC_TF(events[i].filter == EVFILT_READ);
3273                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3274                 process_fd(manager, events[i].ident, readable, writable);
3275         }
3276
3277 #ifdef ISC_PLATFORM_USETHREADS
3278         if (have_ctlevent)
3279                 done = process_ctlfd(manager);
3280 #endif
3281
3282         return (done);
3283 }
3284 #elif defined(USE_EPOLL)
3285 static isc_boolean_t
3286 process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
3287         int i;
3288         isc_boolean_t done = ISC_FALSE;
3289 #ifdef ISC_PLATFORM_USETHREADS
3290         isc_boolean_t have_ctlevent = ISC_FALSE;
3291 #endif
3292
3293         if (nevents == manager->nevents) {
3294                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3295                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3296                             "maximum number of FD events (%d) received",
3297                             nevents);
3298         }
3299
3300         for (i = 0; i < nevents; i++) {
3301                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3302 #ifdef ISC_PLATFORM_USETHREADS
3303                 if (events[i].data.fd == manager->pipe_fds[0]) {
3304                         have_ctlevent = ISC_TRUE;
3305                         continue;
3306                 }
3307 #endif
3308                 if ((events[i].events & EPOLLERR) != 0 ||
3309                     (events[i].events & EPOLLHUP) != 0) {
3310                         /*
3311                          * epoll does not set IN/OUT bits on an erroneous
3312                          * condition, so we need to try both anyway.  This is a
3313                          * bit inefficient, but should be okay for such rare
3314                          * events.  Note also that the read or write attempt
3315                          * won't block because we use non-blocking sockets.
3316                          */
3317                         events[i].events |= (EPOLLIN | EPOLLOUT);
3318                 }
3319                 process_fd(manager, events[i].data.fd,
3320                            (events[i].events & EPOLLIN) != 0,
3321                            (events[i].events & EPOLLOUT) != 0);
3322         }
3323
3324 #ifdef ISC_PLATFORM_USETHREADS
3325         if (have_ctlevent)
3326                 done = process_ctlfd(manager);
3327 #endif
3328
3329         return (done);
3330 }
3331 #elif defined(USE_DEVPOLL)
3332 static isc_boolean_t
3333 process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
3334         int i;
3335         isc_boolean_t done = ISC_FALSE;
3336 #ifdef ISC_PLATFORM_USETHREADS
3337         isc_boolean_t have_ctlevent = ISC_FALSE;
3338 #endif
3339
3340         if (nevents == manager->nevents) {
3341                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3342                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3343                             "maximum number of FD events (%d) received",
3344                             nevents);
3345         }
3346
3347         for (i = 0; i < nevents; i++) {
3348                 REQUIRE(events[i].fd < (int)manager->maxsocks);
3349 #ifdef ISC_PLATFORM_USETHREADS
3350                 if (events[i].fd == manager->pipe_fds[0]) {
3351                         have_ctlevent = ISC_TRUE;
3352                         continue;
3353                 }
3354 #endif
3355                 process_fd(manager, events[i].fd,
3356                            (events[i].events & POLLIN) != 0,
3357                            (events[i].events & POLLOUT) != 0);
3358         }
3359
3360 #ifdef ISC_PLATFORM_USETHREADS
3361         if (have_ctlevent)
3362                 done = process_ctlfd(manager);
3363 #endif
3364
3365         return (done);
3366 }
3367 #elif defined(USE_SELECT)
3368 static void
3369 process_fds(isc_socketmgr_t *manager, int maxfd,
3370             fd_set *readfds, fd_set *writefds)
3371 {
3372         int i;
3373
3374         REQUIRE(maxfd <= (int)manager->maxsocks);
3375
3376         for (i = 0; i < maxfd; i++) {
3377 #ifdef ISC_PLATFORM_USETHREADS
3378                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3379                         continue;
3380 #endif /* ISC_PLATFORM_USETHREADS */
3381                 process_fd(manager, i, FD_ISSET(i, readfds),
3382                            FD_ISSET(i, writefds));
3383         }
3384 }
3385 #endif
3386
3387 #ifdef ISC_PLATFORM_USETHREADS
3388 static isc_boolean_t
3389 process_ctlfd(isc_socketmgr_t *manager) {
3390         int msg, fd;
3391
3392         for (;;) {
3393                 select_readmsg(manager, &fd, &msg);
3394
3395                 manager_log(manager, IOEVENT,
3396                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3397                                            ISC_MSG_WATCHERMSG,
3398                                            "watcher got message %d "
3399                                            "for socket %d"), msg, fd);
3400
3401                 /*
3402                  * Nothing to read?
3403                  */
3404                 if (msg == SELECT_POKE_NOTHING)
3405                         break;
3406
3407                 /*
3408                  * Handle shutdown message.  We really should
3409                  * jump out of this loop right away, but
3410                  * it doesn't matter if we have to do a little
3411                  * more work first.
3412                  */
3413                 if (msg == SELECT_POKE_SHUTDOWN)
3414                         return (ISC_TRUE);
3415
3416                 /*
3417                  * This is a wakeup on a socket.  Look
3418                  * at the event queue for both read and write,
3419                  * and decide if we need to watch on it now
3420                  * or not.
3421                  */
3422                 wakeup_socket(manager, fd, msg);
3423         }
3424
3425         return (ISC_FALSE);
3426 }
3427
3428 /*
3429  * This is the thread that will loop forever, always in a select or poll
3430  * call.
3431  *
3432  * When select returns something to do, track down what thread gets to do
3433  * this I/O and post the event to it.
3434  */
3435 static isc_threadresult_t
3436 watcher(void *uap) {
3437         isc_socketmgr_t *manager = uap;
3438         isc_boolean_t done;
3439         int ctlfd;
3440         int cc;
3441 #ifdef USE_KQUEUE
3442         const char *fnname = "kevent()";
3443 #elif defined (USE_EPOLL)
3444         const char *fnname = "epoll_wait()";
3445 #elif defined(USE_DEVPOLL)
3446         const char *fnname = "ioctl(DP_POLL)";
3447         struct dvpoll dvp;
3448 #elif defined (USE_SELECT)
3449         const char *fnname = "select()";
3450         int maxfd;
3451 #endif
3452         char strbuf[ISC_STRERRORSIZE];
3453 #ifdef ISC_SOCKET_USE_POLLWATCH
3454         pollstate_t pollstate = poll_idle;
3455 #endif
3456
3457         /*
3458          * Get the control fd here.  This will never change.
3459          */
3460         ctlfd = manager->pipe_fds[0];
3461         done = ISC_FALSE;
3462         while (!done) {
3463                 do {
3464 #ifdef USE_KQUEUE
3465                         cc = kevent(manager->kqueue_fd, NULL, 0,
3466                                     manager->events, manager->nevents, NULL);
3467 #elif defined(USE_EPOLL)
3468                         cc = epoll_wait(manager->epoll_fd, manager->events,
3469                                         manager->nevents, -1);
3470 #elif defined(USE_DEVPOLL)
3471                         dvp.dp_fds = manager->events;
3472                         dvp.dp_nfds = manager->nevents;
3473 #ifndef ISC_SOCKET_USE_POLLWATCH
3474                         dvp.dp_timeout = -1;
3475 #else
3476                         if (pollstate == poll_idle)
3477                                 dvp.dp_timeout = -1;
3478                         else
3479                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3480 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3481                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3482 #elif defined(USE_SELECT)
3483                         LOCK(&manager->lock);
3484                         memcpy(manager->read_fds_copy, manager->read_fds,
3485                                manager->fd_bufsize);
3486                         memcpy(manager->write_fds_copy, manager->write_fds,
3487                                manager->fd_bufsize);
3488                         maxfd = manager->maxfd + 1;
3489                         UNLOCK(&manager->lock);
3490
3491                         cc = select(maxfd, manager->read_fds_copy,
3492                                     manager->write_fds_copy, NULL, NULL);
3493 #endif  /* USE_KQUEUE */
3494
3495                         if (cc < 0 && !SOFT_ERROR(errno)) {
3496                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3497                                 FATAL_ERROR(__FILE__, __LINE__,
3498                                             "%s %s: %s", fnname,
3499                                             isc_msgcat_get(isc_msgcat,
3500                                                            ISC_MSGSET_GENERAL,
3501                                                            ISC_MSG_FAILED,
3502                                                            "failed"), strbuf);
3503                         }
3504
3505 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3506                         if (cc == 0) {
3507                                 if (pollstate == poll_active)
3508                                         pollstate = poll_checking;
3509                                 else if (pollstate == poll_checking)
3510                                         pollstate = poll_idle;
3511                         } else if (cc > 0) {
3512                                 if (pollstate == poll_checking) {
3513                                         /*
3514                                          * XXX: We'd like to use a more
3515                                          * verbose log level as it's actually an
3516                                          * unexpected event, but the kernel bug
3517                                          * reportedly happens pretty frequently
3518                                          * (and it can also be a false positive)
3519                                          * so it would be just too noisy.
3520                                          */
3521                                         manager_log(manager,
3522                                                     ISC_LOGCATEGORY_GENERAL,
3523                                                     ISC_LOGMODULE_SOCKET,
3524                                                     ISC_LOG_DEBUG(1),
3525                                                     "unexpected POLL timeout");
3526                                 }
3527                                 pollstate = poll_active;
3528                         }
3529 #endif
3530                 } while (cc < 0);
3531
3532 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3533                 done = process_fds(manager, manager->events, cc);
3534 #elif defined(USE_SELECT)
3535                 process_fds(manager, maxfd, manager->read_fds_copy,
3536                             manager->write_fds_copy);
3537
3538                 /*
3539                  * Process reads on internal, control fd.
3540                  */
3541                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3542                         done = process_ctlfd(manager);
3543 #endif
3544         }
3545
3546         manager_log(manager, TRACE, "%s",
3547                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3548                                    ISC_MSG_EXITING, "watcher exiting"));
3549
3550         return ((isc_threadresult_t)0);
3551 }
3552 #endif /* ISC_PLATFORM_USETHREADS */
3553
3554 void
3555 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3556
3557         REQUIRE(VALID_MANAGER(manager));
3558
3559         manager->reserved = reserved;
3560 }
3561
3562 /*
3563  * Create a new socket manager.
3564  */
3565
3566 static isc_result_t
3567 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3568         isc_result_t result;
3569 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3570         char strbuf[ISC_STRERRORSIZE];
3571 #endif
3572
3573 #ifdef USE_KQUEUE
3574         manager->nevents = ISC_SOCKET_MAXEVENTS;
3575         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3576                                       manager->nevents);
3577         if (manager->events == NULL)
3578                 return (ISC_R_NOMEMORY);
3579         manager->kqueue_fd = kqueue();
3580         if (manager->kqueue_fd == -1) {
3581                 result = isc__errno2result(errno);
3582                 isc__strerror(errno, strbuf, sizeof(strbuf));
3583                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3584                                  "kqueue %s: %s",
3585                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3586                                                 ISC_MSG_FAILED, "failed"),
3587                                  strbuf);
3588                 isc_mem_put(mctx, manager->events,
3589                             sizeof(struct kevent) * manager->nevents);
3590                 return (result);
3591         }
3592
3593 #ifdef ISC_PLATFORM_USETHREADS
3594         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3595         if (result != ISC_R_SUCCESS) {
3596                 close(manager->kqueue_fd);
3597                 isc_mem_put(mctx, manager->events,
3598                             sizeof(struct kevent) * manager->nevents);
3599                 return (result);
3600         }
3601 #endif  /* ISC_PLATFORM_USETHREADS */
3602 #elif defined(USE_EPOLL)
3603         manager->nevents = ISC_SOCKET_MAXEVENTS;
3604         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3605                                       manager->nevents);
3606         if (manager->events == NULL)
3607                 return (ISC_R_NOMEMORY);
3608         manager->epoll_fd = epoll_create(manager->nevents);
3609         if (manager->epoll_fd == -1) {
3610                 result = isc__errno2result(errno);
3611                 isc__strerror(errno, strbuf, sizeof(strbuf));
3612                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3613                                  "epoll_create %s: %s",
3614                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3615                                                 ISC_MSG_FAILED, "failed"),
3616                                  strbuf);
3617                 isc_mem_put(mctx, manager->events,
3618                             sizeof(struct epoll_event) * manager->nevents);
3619                 return (result);
3620         }
3621 #ifdef ISC_PLATFORM_USETHREADS
3622         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3623         if (result != ISC_R_SUCCESS) {
3624                 close(manager->epoll_fd);
3625                 isc_mem_put(mctx, manager->events,
3626                             sizeof(struct epoll_event) * manager->nevents);
3627                 return (result);
3628         }
3629 #endif  /* ISC_PLATFORM_USETHREADS */
3630 #elif defined(USE_DEVPOLL)
3631         /*
3632          * XXXJT: /dev/poll seems to reject large numbers of events,
3633          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3634          */
3635         manager->nevents = ISC_SOCKET_MAXEVENTS;
3636         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3637                                       manager->nevents);
3638         if (manager->events == NULL)
3639                 return (ISC_R_NOMEMORY);
3640         /*
3641          * Note: fdpollinfo should be able to support all possible FDs, so
3642          * it must have maxsocks entries (not nevents).
3643          */
3644         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3645                                           manager->maxsocks);
3646         if (manager->fdpollinfo == NULL) {
3647                 isc_mem_put(mctx, manager->events,
3648                             sizeof(pollinfo_t) * manager->maxsocks);
3649                 return (ISC_R_NOMEMORY);
3650         }
3651         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3652         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3653         if (manager->devpoll_fd == -1) {
3654                 result = isc__errno2result(errno);
3655                 isc__strerror(errno, strbuf, sizeof(strbuf));
3656                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3657                                  "open(/dev/poll) %s: %s",
3658                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3659                                                 ISC_MSG_FAILED, "failed"),
3660                                  strbuf);
3661                 isc_mem_put(mctx, manager->events,
3662                             sizeof(struct pollfd) * manager->nevents);
3663                 isc_mem_put(mctx, manager->fdpollinfo,
3664                             sizeof(pollinfo_t) * manager->maxsocks);
3665                 return (result);
3666         }
3667 #ifdef ISC_PLATFORM_USETHREADS
3668         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3669         if (result != ISC_R_SUCCESS) {
3670                 close(manager->devpoll_fd);
3671                 isc_mem_put(mctx, manager->events,
3672                             sizeof(struct pollfd) * manager->nevents);
3673                 isc_mem_put(mctx, manager->fdpollinfo,
3674                             sizeof(pollinfo_t) * manager->maxsocks);
3675                 return (result);
3676         }
3677 #endif  /* ISC_PLATFORM_USETHREADS */
3678 #elif defined(USE_SELECT)
3679         UNUSED(result);
3680
3681 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3682         /*
3683          * Note: this code should also cover the case of MAXSOCKETS <=
3684          * FD_SETSIZE, but we separate the cases to avoid possible portability
3685          * issues regarding howmany() and the actual representation of fd_set.
3686          */
3687         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3688                 sizeof(fd_mask);
3689 #else
3690         manager->fd_bufsize = sizeof(fd_set);
3691 #endif
3692
3693         manager->read_fds = NULL;
3694         manager->read_fds_copy = NULL;
3695         manager->write_fds = NULL;
3696         manager->write_fds_copy = NULL;
3697
3698         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3699         if (manager->read_fds != NULL)
3700                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3701         if (manager->read_fds_copy != NULL)
3702                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3703         if (manager->write_fds != NULL) {
3704                 manager->write_fds_copy = isc_mem_get(mctx,
3705                                                       manager->fd_bufsize);
3706         }
3707         if (manager->write_fds_copy == NULL) {
3708                 if (manager->write_fds != NULL) {
3709                         isc_mem_put(mctx, manager->write_fds,
3710                                     manager->fd_bufsize);
3711                 }
3712                 if (manager->read_fds_copy != NULL) {
3713                         isc_mem_put(mctx, manager->read_fds_copy,
3714                                     manager->fd_bufsize);
3715                 }
3716                 if (manager->read_fds != NULL) {
3717                         isc_mem_put(mctx, manager->read_fds,
3718                                     manager->fd_bufsize);
3719                 }
3720                 return (ISC_R_NOMEMORY);
3721         }
3722         memset(manager->read_fds, 0, manager->fd_bufsize);
3723         memset(manager->write_fds, 0, manager->fd_bufsize);
3724
3725 #ifdef ISC_PLATFORM_USETHREADS
3726         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3727         manager->maxfd = manager->pipe_fds[0];
3728 #else /* ISC_PLATFORM_USETHREADS */
3729         manager->maxfd = 0;
3730 #endif /* ISC_PLATFORM_USETHREADS */
3731 #endif  /* USE_KQUEUE */
3732
3733         return (ISC_R_SUCCESS);
3734 }
3735
3736 static void
3737 cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3738 #ifdef ISC_PLATFORM_USETHREADS
3739         isc_result_t result;
3740
3741         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3742         if (result != ISC_R_SUCCESS) {
3743                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3744                                  "epoll_ctl(DEL) %s",
3745                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3746                                                 ISC_MSG_FAILED, "failed"));
3747         }
3748 #endif  /* ISC_PLATFORM_USETHREADS */
3749
3750 #ifdef USE_KQUEUE
3751         close(manager->kqueue_fd);
3752         isc_mem_put(mctx, manager->events,
3753                     sizeof(struct kevent) * manager->nevents);
3754 #elif defined(USE_EPOLL)
3755         close(manager->epoll_fd);
3756         isc_mem_put(mctx, manager->events,
3757                     sizeof(struct epoll_event) * manager->nevents);
3758 #elif defined(USE_DEVPOLL)
3759         close(manager->devpoll_fd);
3760         isc_mem_put(mctx, manager->events,
3761                     sizeof(struct pollfd) * manager->nevents);
3762         isc_mem_put(mctx, manager->fdpollinfo,
3763                     sizeof(pollinfo_t) * manager->maxsocks);
3764 #elif defined(USE_SELECT)
3765         if (manager->read_fds != NULL)
3766                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3767         if (manager->read_fds_copy != NULL)
3768                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3769         if (manager->write_fds != NULL)
3770                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3771         if (manager->write_fds_copy != NULL)
3772                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3773 #endif  /* USE_KQUEUE */
3774 }
3775
3776 isc_result_t
3777 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3778         return (isc_socketmgr_create2(mctx, managerp, 0));
3779 }
3780
3781 isc_result_t
3782 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3783                       unsigned int maxsocks)
3784 {
3785         int i;
3786         isc_socketmgr_t *manager;
3787 #ifdef ISC_PLATFORM_USETHREADS
3788         char strbuf[ISC_STRERRORSIZE];
3789 #endif
3790         isc_result_t result;
3791
3792         REQUIRE(managerp != NULL && *managerp == NULL);
3793
3794 #ifndef ISC_PLATFORM_USETHREADS
3795         if (socketmgr != NULL) {
3796                 /* Don't allow maxsocks to be updated */
3797                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3798                         return (ISC_R_EXISTS);
3799
3800                 socketmgr->refs++;
3801                 *managerp = socketmgr;
3802                 return (ISC_R_SUCCESS);
3803         }
3804 #endif /* ISC_PLATFORM_USETHREADS */
3805
3806         if (maxsocks == 0)
3807                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3808
3809         manager = isc_mem_get(mctx, sizeof(*manager));
3810         if (manager == NULL)
3811                 return (ISC_R_NOMEMORY);
3812
3813         /* zero-clear so that necessary cleanup on failure will be easy */
3814         memset(manager, 0, sizeof(*manager));
3815         manager->maxsocks = maxsocks;
3816         manager->reserved = 0;
3817         manager->fds = isc_mem_get(mctx,
3818                                    manager->maxsocks * sizeof(isc_socket_t *));
3819         if (manager->fds == NULL) {
3820                 result = ISC_R_NOMEMORY;
3821                 goto free_manager;
3822         }
3823         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3824         if (manager->fdstate == NULL) {
3825                 result = ISC_R_NOMEMORY;
3826                 goto free_manager;
3827         }
3828         manager->stats = NULL;
3829
3830         manager->magic = SOCKET_MANAGER_MAGIC;
3831         manager->mctx = NULL;
3832         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3833         ISC_LIST_INIT(manager->socklist);
3834         result = isc_mutex_init(&manager->lock);
3835         if (result != ISC_R_SUCCESS)
3836                 goto free_manager;
3837         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3838         if (manager->fdlock == NULL) {
3839                 result = ISC_R_NOMEMORY;
3840                 goto cleanup_lock;
3841         }
3842         for (i = 0; i < FDLOCK_COUNT; i++) {
3843                 result = isc_mutex_init(&manager->fdlock[i]);
3844                 if (result != ISC_R_SUCCESS) {
3845                         while (--i >= 0)
3846                                 DESTROYLOCK(&manager->fdlock[i]);
3847                         isc_mem_put(mctx, manager->fdlock,
3848                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3849                         manager->fdlock = NULL;
3850                         goto cleanup_lock;
3851                 }
3852         }
3853
3854 #ifdef ISC_PLATFORM_USETHREADS
3855         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3856                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3857                                  "isc_condition_init() %s",
3858                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3859                                                 ISC_MSG_FAILED, "failed"));
3860                 result = ISC_R_UNEXPECTED;
3861                 goto cleanup_lock;
3862         }
3863
3864         /*
3865          * Create the special fds that will be used to wake up the
3866          * select/poll loop when something internal needs to be done.
3867          */
3868         if (pipe(manager->pipe_fds) != 0) {
3869                 isc__strerror(errno, strbuf, sizeof(strbuf));
3870                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3871                                  "pipe() %s: %s",
3872                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3873                                                 ISC_MSG_FAILED, "failed"),
3874                                  strbuf);
3875                 result = ISC_R_UNEXPECTED;
3876                 goto cleanup_condition;
3877         }
3878
3879         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3880 #if 0
3881         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3882 #endif
3883 #else /* ISC_PLATFORM_USETHREADS */
3884         manager->refs = 1;
3885 #endif /* ISC_PLATFORM_USETHREADS */
3886
3887         /*
3888          * Set up initial state for the select loop
3889          */
3890         result = setup_watcher(mctx, manager);
3891         if (result != ISC_R_SUCCESS)
3892                 goto cleanup;
3893         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3894 #ifdef ISC_PLATFORM_USETHREADS
3895         /*
3896          * Start up the select/poll thread.
3897          */
3898         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3899             ISC_R_SUCCESS) {
3900                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3901                                  "isc_thread_create() %s",
3902                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3903                                                 ISC_MSG_FAILED, "failed"));
3904                 cleanup_watcher(mctx, manager);
3905                 result = ISC_R_UNEXPECTED;
3906                 goto cleanup;
3907         }
3908 #endif /* ISC_PLATFORM_USETHREADS */
3909         isc_mem_attach(mctx, &manager->mctx);
3910
3911 #ifndef ISC_PLATFORM_USETHREADS
3912         socketmgr = manager;
3913 #endif /* ISC_PLATFORM_USETHREADS */
3914         *managerp = manager;
3915
3916         return (ISC_R_SUCCESS);
3917
3918 cleanup:
3919 #ifdef ISC_PLATFORM_USETHREADS
3920         (void)close(manager->pipe_fds[0]);
3921         (void)close(manager->pipe_fds[1]);
3922 #endif  /* ISC_PLATFORM_USETHREADS */
3923
3924 #ifdef ISC_PLATFORM_USETHREADS
3925 cleanup_condition:
3926         (void)isc_condition_destroy(&manager->shutdown_ok);
3927 #endif  /* ISC_PLATFORM_USETHREADS */
3928
3929
3930 cleanup_lock:
3931         if (manager->fdlock != NULL) {
3932                 for (i = 0; i < FDLOCK_COUNT; i++)
3933                         DESTROYLOCK(&manager->fdlock[i]);
3934         }
3935         DESTROYLOCK(&manager->lock);
3936
3937 free_manager:
3938         if (manager->fdlock != NULL) {
3939                 isc_mem_put(mctx, manager->fdlock,
3940                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3941         }
3942         if (manager->fdstate != NULL) {
3943                 isc_mem_put(mctx, manager->fdstate,
3944                             manager->maxsocks * sizeof(int));
3945         }
3946         if (manager->fds != NULL) {
3947                 isc_mem_put(mctx, manager->fds,
3948                             manager->maxsocks * sizeof(isc_socket_t *));
3949         }
3950         isc_mem_put(mctx, manager, sizeof(*manager));
3951
3952         return (result);
3953 }
3954
3955 isc_result_t
3956 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3957         REQUIRE(VALID_MANAGER(manager));
3958         REQUIRE(nsockp != NULL);
3959
3960         *nsockp = manager->maxsocks;
3961
3962         return (ISC_R_SUCCESS);
3963 }
3964
3965 void
3966 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3967         REQUIRE(VALID_MANAGER(manager));
3968         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3969         REQUIRE(manager->stats == NULL);
3970         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3971
3972         isc_stats_attach(stats, &manager->stats);
3973 }
3974
3975 void
3976 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
3977         isc_socketmgr_t *manager;
3978         int i;
3979         isc_mem_t *mctx;
3980
3981         /*
3982          * Destroy a socket manager.
3983          */
3984
3985         REQUIRE(managerp != NULL);
3986         manager = *managerp;
3987         REQUIRE(VALID_MANAGER(manager));
3988
3989 #ifndef ISC_PLATFORM_USETHREADS
3990         if (manager->refs > 1) {
3991                 manager->refs--;
3992                 *managerp = NULL;
3993                 return;
3994         }
3995 #endif /* ISC_PLATFORM_USETHREADS */
3996
3997         LOCK(&manager->lock);
3998
3999 #ifdef ISC_PLATFORM_USETHREADS
4000         /*
4001          * Wait for all sockets to be destroyed.
4002          */
4003         while (!ISC_LIST_EMPTY(manager->socklist)) {
4004                 manager_log(manager, CREATION, "%s",
4005                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4006                                            ISC_MSG_SOCKETSREMAIN,
4007                                            "sockets exist"));
4008                 WAIT(&manager->shutdown_ok, &manager->lock);
4009         }
4010 #else /* ISC_PLATFORM_USETHREADS */
4011         /*
4012          * Hope all sockets have been destroyed.
4013          */
4014         if (!ISC_LIST_EMPTY(manager->socklist)) {
4015                 manager_log(manager, CREATION, "%s",
4016                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4017                                            ISC_MSG_SOCKETSREMAIN,
4018                                            "sockets exist"));
4019                 INSIST(0);
4020         }
4021 #endif /* ISC_PLATFORM_USETHREADS */
4022
4023         UNLOCK(&manager->lock);
4024
4025         /*
4026          * Here, poke our select/poll thread.  Do this by closing the write
4027          * half of the pipe, which will send EOF to the read half.
4028          * This is currently a no-op in the non-threaded case.
4029          */
4030         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4031
4032 #ifdef ISC_PLATFORM_USETHREADS
4033         /*
4034          * Wait for thread to exit.
4035          */
4036         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4037                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4038                                  "isc_thread_join() %s",
4039                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4040                                                 ISC_MSG_FAILED, "failed"));
4041 #endif /* ISC_PLATFORM_USETHREADS */
4042
4043         /*
4044          * Clean up.
4045          */
4046         cleanup_watcher(manager->mctx, manager);
4047
4048 #ifdef ISC_PLATFORM_USETHREADS
4049         (void)close(manager->pipe_fds[0]);
4050         (void)close(manager->pipe_fds[1]);
4051         (void)isc_condition_destroy(&manager->shutdown_ok);
4052 #endif /* ISC_PLATFORM_USETHREADS */
4053
4054         for (i = 0; i < (int)manager->maxsocks; i++)
4055                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4056                         (void)close(i);
4057
4058         isc_mem_put(manager->mctx, manager->fds,
4059                     manager->maxsocks * sizeof(isc_socket_t *));
4060         isc_mem_put(manager->mctx, manager->fdstate,
4061                     manager->maxsocks * sizeof(int));
4062
4063         if (manager->stats != NULL)
4064                 isc_stats_detach(&manager->stats);
4065
4066         if (manager->fdlock != NULL) {
4067                 for (i = 0; i < FDLOCK_COUNT; i++)
4068                         DESTROYLOCK(&manager->fdlock[i]);
4069                 isc_mem_put(manager->mctx, manager->fdlock,
4070                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4071         }
4072         DESTROYLOCK(&manager->lock);
4073         manager->magic = 0;
4074         mctx= manager->mctx;
4075         isc_mem_put(mctx, manager, sizeof(*manager));
4076
4077         isc_mem_detach(&mctx);
4078
4079         *managerp = NULL;
4080 }
4081
4082 static isc_result_t
4083 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4084             unsigned int flags)
4085 {
4086         int io_state;
4087         isc_boolean_t have_lock = ISC_FALSE;
4088         isc_task_t *ntask = NULL;
4089         isc_result_t result = ISC_R_SUCCESS;
4090
4091         dev->ev_sender = task;
4092
4093         if (sock->type == isc_sockettype_udp) {
4094                 io_state = doio_recv(sock, dev);
4095         } else {
4096                 LOCK(&sock->lock);
4097                 have_lock = ISC_TRUE;
4098
4099                 if (ISC_LIST_EMPTY(sock->recv_list))
4100                         io_state = doio_recv(sock, dev);
4101                 else
4102                         io_state = DOIO_SOFT;
4103         }
4104
4105         switch (io_state) {
4106         case DOIO_SOFT:
4107                 /*
4108                  * We couldn't read all or part of the request right now, so
4109                  * queue it.
4110                  *
4111                  * Attach to socket and to task
4112                  */
4113                 isc_task_attach(task, &ntask);
4114                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4115
4116                 if (!have_lock) {
4117                         LOCK(&sock->lock);
4118                         have_lock = ISC_TRUE;
4119                 }
4120
4121                 /*
4122                  * Enqueue the request.  If the socket was previously not being
4123                  * watched, poke the watcher to start paying attention to it.
4124                  */
4125                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4126                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4127                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4128
4129                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4130                            "socket_recv: event %p -> task %p",
4131                            dev, ntask);
4132
4133                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4134                         result = ISC_R_INPROGRESS;
4135                 break;
4136
4137         case DOIO_EOF:
4138                 dev->result = ISC_R_EOF;
4139                 /* fallthrough */
4140
4141         case DOIO_HARD:
4142         case DOIO_SUCCESS:
4143                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4144                         send_recvdone_event(sock, &dev);
4145                 break;
4146         }
4147
4148         if (have_lock)
4149                 UNLOCK(&sock->lock);
4150
4151         return (result);
4152 }
4153
4154 isc_result_t
4155 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4156                  unsigned int minimum, isc_task_t *task,
4157                  isc_taskaction_t action, const void *arg)
4158 {
4159         isc_socketevent_t *dev;
4160         isc_socketmgr_t *manager;
4161         unsigned int iocount;
4162         isc_buffer_t *buffer;
4163
4164         REQUIRE(VALID_SOCKET(sock));
4165         REQUIRE(buflist != NULL);
4166         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4167         REQUIRE(task != NULL);
4168         REQUIRE(action != NULL);
4169
4170         manager = sock->manager;
4171         REQUIRE(VALID_MANAGER(manager));
4172
4173         iocount = isc_bufferlist_availablecount(buflist);
4174         REQUIRE(iocount > 0);
4175
4176         INSIST(sock->bound);
4177
4178         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4179         if (dev == NULL) {
4180                 return (ISC_R_NOMEMORY);
4181         }
4182
4183         /*
4184          * UDP sockets are always partial read
4185          */
4186         if (sock->type == isc_sockettype_udp)
4187                 dev->minimum = 1;
4188         else {
4189                 if (minimum == 0)
4190                         dev->minimum = iocount;
4191                 else
4192                         dev->minimum = minimum;
4193         }
4194
4195         /*
4196          * Move each buffer from the passed in list to our internal one.
4197          */
4198         buffer = ISC_LIST_HEAD(*buflist);
4199         while (buffer != NULL) {
4200                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4201                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4202                 buffer = ISC_LIST_HEAD(*buflist);
4203         }
4204
4205         return (socket_recv(sock, dev, task, 0));
4206 }
4207
4208 isc_result_t
4209 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4210                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4211 {
4212         isc_socketevent_t *dev;
4213         isc_socketmgr_t *manager;
4214
4215         REQUIRE(VALID_SOCKET(sock));
4216         REQUIRE(action != NULL);
4217
4218         manager = sock->manager;
4219         REQUIRE(VALID_MANAGER(manager));
4220
4221         INSIST(sock->bound);
4222
4223         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4224         if (dev == NULL)
4225                 return (ISC_R_NOMEMORY);
4226
4227         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4228 }
4229
4230 isc_result_t
4231 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
4232                  unsigned int minimum, isc_task_t *task,
4233                  isc_socketevent_t *event, unsigned int flags)
4234 {
4235         event->ev_sender = sock;
4236         event->result = ISC_R_UNEXPECTED;
4237         ISC_LIST_INIT(event->bufferlist);
4238         event->region = *region;
4239         event->n = 0;
4240         event->offset = 0;
4241         event->attributes = 0;
4242
4243         /*
4244          * UDP sockets are always partial read.
4245          */
4246         if (sock->type == isc_sockettype_udp)
4247                 event->minimum = 1;
4248         else {
4249                 if (minimum == 0)
4250                         event->minimum = region->length;
4251                 else
4252                         event->minimum = minimum;
4253         }
4254
4255         return (socket_recv(sock, event, task, flags));
4256 }
4257
4258 static isc_result_t
4259 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4260             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4261             unsigned int flags)
4262 {
4263         int io_state;
4264         isc_boolean_t have_lock = ISC_FALSE;
4265         isc_task_t *ntask = NULL;
4266         isc_result_t result = ISC_R_SUCCESS;
4267
4268         dev->ev_sender = task;
4269
4270         set_dev_address(address, sock, dev);
4271         if (pktinfo != NULL) {
4272                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4273                 dev->pktinfo = *pktinfo;
4274
4275                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4276                     !isc_sockaddr_islinklocal(&dev->address)) {
4277                         socket_log(sock, NULL, TRACE, isc_msgcat,
4278                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4279                                    "pktinfo structure provided, ifindex %u "
4280                                    "(set to 0)", pktinfo->ipi6_ifindex);
4281
4282                         /*
4283                          * Set the pktinfo index to 0 here, to let the
4284                          * kernel decide what interface it should send on.
4285                          */
4286                         dev->pktinfo.ipi6_ifindex = 0;
4287                 }
4288         }
4289
4290         if (sock->type == isc_sockettype_udp)
4291                 io_state = doio_send(sock, dev);
4292         else {
4293                 LOCK(&sock->lock);
4294                 have_lock = ISC_TRUE;
4295
4296                 if (ISC_LIST_EMPTY(sock->send_list))
4297                         io_state = doio_send(sock, dev);
4298                 else
4299                         io_state = DOIO_SOFT;
4300         }
4301
4302         switch (io_state) {
4303         case DOIO_SOFT:
4304                 /*
4305                  * We couldn't send all or part of the request right now, so
4306                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4307                  */
4308                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4309                         isc_task_attach(task, &ntask);
4310                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4311
4312                         if (!have_lock) {
4313                                 LOCK(&sock->lock);
4314                                 have_lock = ISC_TRUE;
4315                         }
4316
4317                         /*
4318                          * Enqueue the request.  If the socket was previously
4319                          * not being watched, poke the watcher to start
4320                          * paying attention to it.
4321                          */
4322                         if (ISC_LIST_EMPTY(sock->send_list) &&
4323                             !sock->pending_send)
4324                                 select_poke(sock->manager, sock->fd,
4325                                             SELECT_POKE_WRITE);
4326                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4327
4328                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4329                                    "socket_send: event %p -> task %p",
4330                                    dev, ntask);
4331
4332                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4333                                 result = ISC_R_INPROGRESS;
4334                         break;
4335                 }
4336
4337         case DOIO_HARD:
4338         case DOIO_SUCCESS:
4339                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4340                         send_senddone_event(sock, &dev);
4341                 break;
4342         }
4343
4344         if (have_lock)
4345                 UNLOCK(&sock->lock);
4346
4347         return (result);
4348 }
4349
4350 isc_result_t
4351 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
4352                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4353 {
4354         /*
4355          * REQUIRE() checking is performed in isc_socket_sendto().
4356          */
4357         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
4358                                   NULL));
4359 }
4360
4361 isc_result_t
4362 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
4363                   isc_task_t *task, isc_taskaction_t action, const void *arg,
4364                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4365 {
4366         isc_socketevent_t *dev;
4367         isc_socketmgr_t *manager;
4368
4369         REQUIRE(VALID_SOCKET(sock));
4370         REQUIRE(region != NULL);
4371         REQUIRE(task != NULL);
4372         REQUIRE(action != NULL);
4373
4374         manager = sock->manager;
4375         REQUIRE(VALID_MANAGER(manager));
4376
4377         INSIST(sock->bound);
4378
4379         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4380         if (dev == NULL) {
4381                 return (ISC_R_NOMEMORY);
4382         }
4383
4384         dev->region = *region;
4385
4386         return (socket_send(sock, dev, task, address, pktinfo, 0));
4387 }
4388
4389 isc_result_t
4390 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4391                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4392 {
4393         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4394                                    NULL));
4395 }
4396
4397 isc_result_t
4398 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4399                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4400                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4401 {
4402         isc_socketevent_t *dev;
4403         isc_socketmgr_t *manager;
4404         unsigned int iocount;
4405         isc_buffer_t *buffer;
4406
4407         REQUIRE(VALID_SOCKET(sock));
4408         REQUIRE(buflist != NULL);
4409         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4410         REQUIRE(task != NULL);
4411         REQUIRE(action != NULL);
4412
4413         manager = sock->manager;
4414         REQUIRE(VALID_MANAGER(manager));
4415
4416         iocount = isc_bufferlist_usedcount(buflist);
4417         REQUIRE(iocount > 0);
4418
4419         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4420         if (dev == NULL) {
4421                 return (ISC_R_NOMEMORY);
4422         }
4423
4424         /*
4425          * Move each buffer from the passed in list to our internal one.
4426          */
4427         buffer = ISC_LIST_HEAD(*buflist);
4428         while (buffer != NULL) {
4429                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4430                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4431                 buffer = ISC_LIST_HEAD(*buflist);
4432         }
4433
4434         return (socket_send(sock, dev, task, address, pktinfo, 0));
4435 }
4436
4437 isc_result_t
4438 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4439                    isc_task_t *task,
4440                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4441                    isc_socketevent_t *event, unsigned int flags)
4442 {
4443         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4444         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4445                 REQUIRE(sock->type == isc_sockettype_udp);
4446         event->ev_sender = sock;
4447         event->result = ISC_R_UNEXPECTED;
4448         ISC_LIST_INIT(event->bufferlist);
4449         event->region = *region;
4450         event->n = 0;
4451         event->offset = 0;
4452         event->attributes = 0;
4453
4454         return (socket_send(sock, event, task, address, pktinfo, flags));
4455 }
4456
4457 void
4458 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4459 #ifdef ISC_PLATFORM_HAVESYSUNH
4460         int s;
4461         struct stat sb;
4462         char strbuf[ISC_STRERRORSIZE];
4463
4464         if (sockaddr->type.sa.sa_family != AF_UNIX)
4465                 return;
4466
4467 #ifndef S_ISSOCK
4468 #if defined(S_IFMT) && defined(S_IFSOCK)
4469 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4470 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4471 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4472 #endif
4473 #endif
4474
4475 #ifndef S_ISFIFO
4476 #if defined(S_IFMT) && defined(S_IFIFO)
4477 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4478 #elif defined(_S_IFMT) && defined(S_IFIFO)
4479 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4480 #endif
4481 #endif
4482
4483 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4484 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4485 #endif
4486
4487 #ifndef S_ISFIFO
4488 #define S_ISFIFO(mode) 0
4489 #endif
4490
4491 #ifndef S_ISSOCK
4492 #define S_ISSOCK(mode) 0
4493 #endif
4494
4495         if (active) {
4496                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4497                         isc__strerror(errno, strbuf, sizeof(strbuf));
4498                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4499                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4500                                       "isc_socket_cleanunix: stat(%s): %s",
4501                                       sockaddr->type.sunix.sun_path, strbuf);
4502                         return;
4503                 }
4504                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4505                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4506                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4507                                       "isc_socket_cleanunix: %s: not a socket",
4508                                       sockaddr->type.sunix.sun_path);
4509                         return;
4510                 }
4511                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4512                         isc__strerror(errno, strbuf, sizeof(strbuf));
4513                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4514                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4515                                       "isc_socket_cleanunix: unlink(%s): %s",
4516                                       sockaddr->type.sunix.sun_path, strbuf);
4517                 }
4518                 return;
4519         }
4520
4521         s = socket(AF_UNIX, SOCK_STREAM, 0);
4522         if (s < 0) {
4523                 isc__strerror(errno, strbuf, sizeof(strbuf));
4524                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4525                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4526                               "isc_socket_cleanunix: socket(%s): %s",
4527                               sockaddr->type.sunix.sun_path, strbuf);
4528                 return;
4529         }
4530
4531         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4532                 switch (errno) {
4533                 case ENOENT:    /* We exited cleanly last time */
4534                         break;
4535                 default:
4536                         isc__strerror(errno, strbuf, sizeof(strbuf));
4537                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4538                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4539                                       "isc_socket_cleanunix: stat(%s): %s",
4540                                       sockaddr->type.sunix.sun_path, strbuf);
4541                         break;
4542                 }
4543                 goto cleanup;
4544         }
4545
4546         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4547                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4548                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4549                               "isc_socket_cleanunix: %s: not a socket",
4550                               sockaddr->type.sunix.sun_path);
4551                 goto cleanup;
4552         }
4553
4554         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4555                     sizeof(sockaddr->type.sunix)) < 0) {
4556                 switch (errno) {
4557                 case ECONNREFUSED:
4558                 case ECONNRESET:
4559                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4560                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4561                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4562                                               ISC_LOGMODULE_SOCKET,
4563                                               ISC_LOG_WARNING,
4564                                               "isc_socket_cleanunix: "
4565                                               "unlink(%s): %s",
4566                                               sockaddr->type.sunix.sun_path,
4567                                               strbuf);
4568                         }
4569                         break;
4570                 default:
4571                         isc__strerror(errno, strbuf, sizeof(strbuf));
4572                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4573                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4574                                       "isc_socket_cleanunix: connect(%s): %s",
4575                                       sockaddr->type.sunix.sun_path, strbuf);
4576                         break;
4577                 }
4578         }
4579  cleanup:
4580         close(s);
4581 #else
4582         UNUSED(sockaddr);
4583         UNUSED(active);
4584 #endif
4585 }
4586
4587 isc_result_t
4588 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4589                     isc_uint32_t owner, isc_uint32_t group)
4590 {
4591 #ifdef ISC_PLATFORM_HAVESYSUNH
4592         isc_result_t result = ISC_R_SUCCESS;
4593         char strbuf[ISC_STRERRORSIZE];
4594         char path[sizeof(sockaddr->type.sunix.sun_path)];
4595 #ifdef NEED_SECURE_DIRECTORY
4596         char *slash;
4597 #endif
4598
4599         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4600         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4601         strcpy(path, sockaddr->type.sunix.sun_path);
4602
4603 #ifdef NEED_SECURE_DIRECTORY
4604         slash = strrchr(path, '/');
4605         if (slash != NULL) {
4606                 if (slash != path)
4607                         *slash = '\0';
4608                 else
4609                         strcpy(path, "/");
4610         } else
4611                 strcpy(path, ".");
4612 #endif
4613
4614         if (chmod(path, perm) < 0) {
4615                 isc__strerror(errno, strbuf, sizeof(strbuf));
4616                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4617                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4618                               "isc_socket_permunix: chmod(%s, %d): %s",
4619                               path, perm, strbuf);
4620                 result = ISC_R_FAILURE;
4621         }
4622         if (chown(path, owner, group) < 0) {
4623                 isc__strerror(errno, strbuf, sizeof(strbuf));
4624                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4625                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4626                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4627                               path, owner, group,
4628                               strbuf);
4629                 result = ISC_R_FAILURE;
4630         }
4631         return (result);
4632 #else
4633         UNUSED(sockaddr);
4634         UNUSED(perm);
4635         UNUSED(owner);
4636         UNUSED(group);
4637         return (ISC_R_NOTIMPLEMENTED);
4638 #endif
4639 }
4640
4641 isc_result_t
4642 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4643                 unsigned int options) {
4644         char strbuf[ISC_STRERRORSIZE];
4645         int on = 1;
4646
4647         LOCK(&sock->lock);
4648
4649         INSIST(!sock->bound);
4650
4651         if (sock->pf != sockaddr->type.sa.sa_family) {
4652                 UNLOCK(&sock->lock);
4653                 return (ISC_R_FAMILYMISMATCH);
4654         }
4655         /*
4656          * Only set SO_REUSEADDR when we want a specific port.
4657          */
4658 #ifdef AF_UNIX
4659         if (sock->pf == AF_UNIX)
4660                 goto bind_socket;
4661 #endif
4662         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4663             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4664             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4665                        sizeof(on)) < 0) {
4666                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4667                                  "setsockopt(%d) %s", sock->fd,
4668                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4669                                                 ISC_MSG_FAILED, "failed"));
4670                 /* Press on... */
4671         }
4672 #ifdef AF_UNIX
4673  bind_socket:
4674 #endif
4675         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4676                 inc_stats(sock->manager->stats,
4677                           sock->statsindex[STATID_BINDFAIL]);
4678
4679                 UNLOCK(&sock->lock);
4680                 switch (errno) {
4681                 case EACCES:
4682                         return (ISC_R_NOPERM);
4683                 case EADDRNOTAVAIL:
4684                         return (ISC_R_ADDRNOTAVAIL);
4685                 case EADDRINUSE:
4686                         return (ISC_R_ADDRINUSE);
4687                 case EINVAL:
4688                         return (ISC_R_BOUND);
4689                 default:
4690                         isc__strerror(errno, strbuf, sizeof(strbuf));
4691                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4692                                          strbuf);
4693                         return (ISC_R_UNEXPECTED);
4694                 }
4695         }
4696
4697         socket_log(sock, sockaddr, TRACE,
4698                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4699         sock->bound = 1;
4700
4701         UNLOCK(&sock->lock);
4702         return (ISC_R_SUCCESS);
4703 }
4704
4705 isc_result_t
4706 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4707 #ifdef SO_ACCEPTFILTER
4708         char strbuf[ISC_STRERRORSIZE];
4709         struct accept_filter_arg afa;
4710 #else
4711         UNUSED(sock);
4712         UNUSED(filter);
4713 #endif
4714
4715         REQUIRE(VALID_SOCKET(sock));
4716
4717 #ifdef SO_ACCEPTFILTER
4718         bzero(&afa, sizeof(afa));
4719         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4720         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4721                          &afa, sizeof(afa)) == -1) {
4722                 isc__strerror(errno, strbuf, sizeof(strbuf));
4723                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4724                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4725                            strbuf);
4726                 return (ISC_R_FAILURE);
4727         }
4728         return (ISC_R_SUCCESS);
4729 #else
4730         return (ISC_R_NOTIMPLEMENTED);
4731 #endif
4732 }
4733
4734 /*
4735  * Set up to listen on a given socket.  We do this by creating an internal
4736  * event that will be dispatched when the socket has read activity.  The
4737  * watcher will send the internal event to the task when there is a new
4738  * connection.
4739  *
4740  * Unlike in read, we don't preallocate a done event here.  Every time there
4741  * is a new connection we'll have to allocate a new one anyway, so we might
4742  * as well keep things simple rather than having to track them.
4743  */
4744 isc_result_t
4745 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4746         char strbuf[ISC_STRERRORSIZE];
4747
4748         REQUIRE(VALID_SOCKET(sock));
4749
4750         LOCK(&sock->lock);
4751
4752         REQUIRE(!sock->listener);
4753         REQUIRE(sock->bound);
4754         REQUIRE(sock->type == isc_sockettype_tcp ||
4755                 sock->type == isc_sockettype_unix);
4756
4757         if (backlog == 0)
4758                 backlog = SOMAXCONN;
4759
4760         if (listen(sock->fd, (int)backlog) < 0) {
4761                 UNLOCK(&sock->lock);
4762                 isc__strerror(errno, strbuf, sizeof(strbuf));
4763
4764                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4765
4766                 return (ISC_R_UNEXPECTED);
4767         }
4768
4769         sock->listener = 1;
4770
4771         UNLOCK(&sock->lock);
4772         return (ISC_R_SUCCESS);
4773 }
4774
4775 /*
4776  * This should try to do aggressive accept() XXXMLG
4777  */
4778 isc_result_t
4779 isc_socket_accept(isc_socket_t *sock,
4780                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4781 {
4782         isc_socket_newconnev_t *dev;
4783         isc_socketmgr_t *manager;
4784         isc_task_t *ntask = NULL;
4785         isc_socket_t *nsock;
4786         isc_result_t result;
4787         isc_boolean_t do_poke = ISC_FALSE;
4788
4789         REQUIRE(VALID_SOCKET(sock));
4790         manager = sock->manager;
4791         REQUIRE(VALID_MANAGER(manager));
4792
4793         LOCK(&sock->lock);
4794
4795         REQUIRE(sock->listener);
4796
4797         /*
4798          * Sender field is overloaded here with the task we will be sending
4799          * this event to.  Just before the actual event is delivered the
4800          * actual ev_sender will be touched up to be the socket.
4801          */
4802         dev = (isc_socket_newconnev_t *)
4803                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4804                                    action, arg, sizeof(*dev));
4805         if (dev == NULL) {
4806                 UNLOCK(&sock->lock);
4807                 return (ISC_R_NOMEMORY);
4808         }
4809         ISC_LINK_INIT(dev, ev_link);
4810
4811         result = allocate_socket(manager, sock->type, &nsock);
4812         if (result != ISC_R_SUCCESS) {
4813                 isc_event_free(ISC_EVENT_PTR(&dev));
4814                 UNLOCK(&sock->lock);
4815                 return (result);
4816         }
4817
4818         /*
4819          * Attach to socket and to task.
4820          */
4821         isc_task_attach(task, &ntask);
4822         nsock->references++;
4823         nsock->statsindex = sock->statsindex;
4824
4825         dev->ev_sender = ntask;
4826         dev->newsocket = nsock;
4827
4828         /*
4829          * Poke watcher here.  We still have the socket locked, so there
4830          * is no race condition.  We will keep the lock for such a short
4831          * bit of time waking it up now or later won't matter all that much.
4832          */
4833         if (ISC_LIST_EMPTY(sock->accept_list))
4834                 do_poke = ISC_TRUE;
4835
4836         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4837
4838         if (do_poke)
4839                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4840
4841         UNLOCK(&sock->lock);
4842         return (ISC_R_SUCCESS);
4843 }
4844
4845 isc_result_t
4846 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4847                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4848 {
4849         isc_socket_connev_t *dev;
4850         isc_task_t *ntask = NULL;
4851         isc_socketmgr_t *manager;
4852         int cc;
4853         char strbuf[ISC_STRERRORSIZE];
4854
4855         REQUIRE(VALID_SOCKET(sock));
4856         REQUIRE(addr != NULL);
4857         REQUIRE(task != NULL);
4858         REQUIRE(action != NULL);
4859
4860         manager = sock->manager;
4861         REQUIRE(VALID_MANAGER(manager));
4862         REQUIRE(addr != NULL);
4863
4864         if (isc_sockaddr_ismulticast(addr))
4865                 return (ISC_R_MULTICAST);
4866
4867         LOCK(&sock->lock);
4868
4869         REQUIRE(!sock->connecting);
4870
4871         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4872                                                         ISC_SOCKEVENT_CONNECT,
4873                                                         action, arg,
4874                                                         sizeof(*dev));
4875         if (dev == NULL) {
4876                 UNLOCK(&sock->lock);
4877                 return (ISC_R_NOMEMORY);
4878         }
4879         ISC_LINK_INIT(dev, ev_link);
4880
4881         /*
4882          * Try to do the connect right away, as there can be only one
4883          * outstanding, and it might happen to complete.
4884          */
4885         sock->peer_address = *addr;
4886         cc = connect(sock->fd, &addr->type.sa, addr->length);
4887         if (cc < 0) {
4888                 /*
4889                  * HP-UX "fails" to connect a UDP socket and sets errno to
4890                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4891                  * a success and let the user detect it if it's really an error
4892                  * at the time of sending a packet on the socket.
4893                  */
4894                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4895                         cc = 0;
4896                         goto success;
4897                 }
4898                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4899                         goto queue;
4900
4901                 switch (errno) {
4902 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4903                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4904                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4905                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4906                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4907                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4908 #ifdef EHOSTDOWN
4909                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4910 #endif
4911                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4912                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4913                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4914                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4915                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4916 #undef ERROR_MATCH
4917                 }
4918
4919                 sock->connected = 0;
4920
4921                 isc__strerror(errno, strbuf, sizeof(strbuf));
4922                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
4923
4924                 UNLOCK(&sock->lock);
4925                 inc_stats(sock->manager->stats,
4926                           sock->statsindex[STATID_CONNECTFAIL]);
4927                 isc_event_free(ISC_EVENT_PTR(&dev));
4928                 return (ISC_R_UNEXPECTED);
4929
4930         err_exit:
4931                 sock->connected = 0;
4932                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4933
4934                 UNLOCK(&sock->lock);
4935                 inc_stats(sock->manager->stats,
4936                           sock->statsindex[STATID_CONNECTFAIL]);
4937                 return (ISC_R_SUCCESS);
4938         }
4939
4940         /*
4941          * If connect completed, fire off the done event.
4942          */
4943  success:
4944         if (cc == 0) {
4945                 sock->connected = 1;
4946                 sock->bound = 1;
4947                 dev->result = ISC_R_SUCCESS;
4948                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4949
4950                 UNLOCK(&sock->lock);
4951
4952                 inc_stats(sock->manager->stats,
4953                           sock->statsindex[STATID_CONNECT]);
4954
4955                 return (ISC_R_SUCCESS);
4956         }
4957
4958  queue:
4959
4960         /*
4961          * Attach to task.
4962          */
4963         isc_task_attach(task, &ntask);
4964
4965         sock->connecting = 1;
4966
4967         dev->ev_sender = ntask;
4968
4969         /*
4970          * Poke watcher here.  We still have the socket locked, so there
4971          * is no race condition.  We will keep the lock for such a short
4972          * bit of time waking it up now or later won't matter all that much.
4973          */
4974         if (sock->connect_ev == NULL)
4975                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
4976
4977         sock->connect_ev = dev;
4978
4979         UNLOCK(&sock->lock);
4980         return (ISC_R_SUCCESS);
4981 }
4982
4983 /*
4984  * Called when a socket with a pending connect() finishes.
4985  */
4986 static void
4987 internal_connect(isc_task_t *me, isc_event_t *ev) {
4988         isc_socket_t *sock;
4989         isc_socket_connev_t *dev;
4990         isc_task_t *task;
4991         int cc;
4992         ISC_SOCKADDR_LEN_T optlen;
4993         char strbuf[ISC_STRERRORSIZE];
4994         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4995
4996         UNUSED(me);
4997         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
4998
4999         sock = ev->ev_sender;
5000         INSIST(VALID_SOCKET(sock));
5001
5002         LOCK(&sock->lock);
5003
5004         /*
5005          * When the internal event was sent the reference count was bumped
5006          * to keep the socket around for us.  Decrement the count here.
5007          */
5008         INSIST(sock->references > 0);
5009         sock->references--;
5010         if (sock->references == 0) {
5011                 UNLOCK(&sock->lock);
5012                 destroy(&sock);
5013                 return;
5014         }
5015
5016         /*
5017          * Has this event been canceled?
5018          */
5019         dev = sock->connect_ev;
5020         if (dev == NULL) {
5021                 INSIST(!sock->connecting);
5022                 UNLOCK(&sock->lock);
5023                 return;
5024         }
5025
5026         INSIST(sock->connecting);
5027         sock->connecting = 0;
5028
5029         /*
5030          * Get any possible error status here.
5031          */
5032         optlen = sizeof(cc);
5033         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5034                        (void *)&cc, (void *)&optlen) < 0)
5035                 cc = errno;
5036         else
5037                 errno = cc;
5038
5039         if (errno != 0) {
5040                 /*
5041                  * If the error is EAGAIN, just re-select on this
5042                  * fd and pretend nothing strange happened.
5043                  */
5044                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5045                         sock->connecting = 1;
5046                         select_poke(sock->manager, sock->fd,
5047                                     SELECT_POKE_CONNECT);
5048                         UNLOCK(&sock->lock);
5049
5050                         return;
5051                 }
5052
5053                 inc_stats(sock->manager->stats,
5054                           sock->statsindex[STATID_CONNECTFAIL]);
5055
5056                 /*
5057                  * Translate other errors into ISC_R_* flavors.
5058                  */
5059                 switch (errno) {
5060 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5061                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5062                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5063                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5064                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5065                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5066 #ifdef EHOSTDOWN
5067                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5068 #endif
5069                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5070                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5071                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5072                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5073                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5074                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5075 #undef ERROR_MATCH
5076                 default:
5077                         dev->result = ISC_R_UNEXPECTED;
5078                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5079                                             sizeof(peerbuf));
5080                         isc__strerror(errno, strbuf, sizeof(strbuf));
5081                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5082                                          "internal_connect: connect(%s) %s",
5083                                          peerbuf, strbuf);
5084                 }
5085         } else {
5086                 inc_stats(sock->manager->stats,
5087                           sock->statsindex[STATID_CONNECT]);
5088                 dev->result = ISC_R_SUCCESS;
5089                 sock->connected = 1;
5090                 sock->bound = 1;
5091         }
5092
5093         sock->connect_ev = NULL;
5094
5095         UNLOCK(&sock->lock);
5096
5097         task = dev->ev_sender;
5098         dev->ev_sender = sock;
5099         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5100 }
5101
5102 isc_result_t
5103 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5104         isc_result_t result;
5105
5106         REQUIRE(VALID_SOCKET(sock));
5107         REQUIRE(addressp != NULL);
5108
5109         LOCK(&sock->lock);
5110
5111         if (sock->connected) {
5112                 *addressp = sock->peer_address;
5113                 result = ISC_R_SUCCESS;
5114         } else {
5115                 result = ISC_R_NOTCONNECTED;
5116         }
5117
5118         UNLOCK(&sock->lock);
5119
5120         return (result);
5121 }
5122
5123 isc_result_t
5124 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5125         ISC_SOCKADDR_LEN_T len;
5126         isc_result_t result;
5127         char strbuf[ISC_STRERRORSIZE];
5128
5129         REQUIRE(VALID_SOCKET(sock));
5130         REQUIRE(addressp != NULL);
5131
5132         LOCK(&sock->lock);
5133
5134         if (!sock->bound) {
5135                 result = ISC_R_NOTBOUND;
5136                 goto out;
5137         }
5138
5139         result = ISC_R_SUCCESS;
5140
5141         len = sizeof(addressp->type);
5142         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5143                 isc__strerror(errno, strbuf, sizeof(strbuf));
5144                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5145                                  strbuf);
5146                 result = ISC_R_UNEXPECTED;
5147                 goto out;
5148         }
5149         addressp->length = (unsigned int)len;
5150
5151  out:
5152         UNLOCK(&sock->lock);
5153
5154         return (result);
5155 }
5156
5157 /*
5158  * Run through the list of events on this socket, and cancel the ones
5159  * queued for task "task" of type "how".  "how" is a bitmask.
5160  */
5161 void
5162 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5163
5164         REQUIRE(VALID_SOCKET(sock));
5165
5166         /*
5167          * Quick exit if there is nothing to do.  Don't even bother locking
5168          * in this case.
5169          */
5170         if (how == 0)
5171                 return;
5172
5173         LOCK(&sock->lock);
5174
5175         /*
5176          * All of these do the same thing, more or less.
5177          * Each will:
5178          *      o If the internal event is marked as "posted" try to
5179          *        remove it from the task's queue.  If this fails, mark it
5180          *        as canceled instead, and let the task clean it up later.
5181          *      o For each I/O request for that task of that type, post
5182          *        its done event with status of "ISC_R_CANCELED".
5183          *      o Reset any state needed.
5184          */
5185         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5186             && !ISC_LIST_EMPTY(sock->recv_list)) {
5187                 isc_socketevent_t      *dev;
5188                 isc_socketevent_t      *next;
5189                 isc_task_t             *current_task;
5190
5191                 dev = ISC_LIST_HEAD(sock->recv_list);
5192
5193                 while (dev != NULL) {
5194                         current_task = dev->ev_sender;
5195                         next = ISC_LIST_NEXT(dev, ev_link);
5196
5197                         if ((task == NULL) || (task == current_task)) {
5198                                 dev->result = ISC_R_CANCELED;
5199                                 send_recvdone_event(sock, &dev);
5200                         }
5201                         dev = next;
5202                 }
5203         }
5204
5205         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5206             && !ISC_LIST_EMPTY(sock->send_list)) {
5207                 isc_socketevent_t      *dev;
5208                 isc_socketevent_t      *next;
5209                 isc_task_t             *current_task;
5210
5211                 dev = ISC_LIST_HEAD(sock->send_list);
5212
5213                 while (dev != NULL) {
5214                         current_task = dev->ev_sender;
5215                         next = ISC_LIST_NEXT(dev, ev_link);
5216
5217                         if ((task == NULL) || (task == current_task)) {
5218                                 dev->result = ISC_R_CANCELED;
5219                                 send_senddone_event(sock, &dev);
5220                         }
5221                         dev = next;
5222                 }
5223         }
5224
5225         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5226             && !ISC_LIST_EMPTY(sock->accept_list)) {
5227                 isc_socket_newconnev_t *dev;
5228                 isc_socket_newconnev_t *next;
5229                 isc_task_t             *current_task;
5230
5231                 dev = ISC_LIST_HEAD(sock->accept_list);
5232                 while (dev != NULL) {
5233                         current_task = dev->ev_sender;
5234                         next = ISC_LIST_NEXT(dev, ev_link);
5235
5236                         if ((task == NULL) || (task == current_task)) {
5237
5238                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5239                                                 ev_link);
5240
5241                                 dev->newsocket->references--;
5242                                 free_socket(&dev->newsocket);
5243
5244                                 dev->result = ISC_R_CANCELED;
5245                                 dev->ev_sender = sock;
5246                                 isc_task_sendanddetach(&current_task,
5247                                                        ISC_EVENT_PTR(&dev));
5248                         }
5249
5250                         dev = next;
5251                 }
5252         }
5253
5254         /*
5255          * Connecting is not a list.
5256          */
5257         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5258             && sock->connect_ev != NULL) {
5259                 isc_socket_connev_t    *dev;
5260                 isc_task_t             *current_task;
5261
5262                 INSIST(sock->connecting);
5263                 sock->connecting = 0;
5264
5265                 dev = sock->connect_ev;
5266                 current_task = dev->ev_sender;
5267
5268                 if ((task == NULL) || (task == current_task)) {
5269                         sock->connect_ev = NULL;
5270
5271                         dev->result = ISC_R_CANCELED;
5272                         dev->ev_sender = sock;
5273                         isc_task_sendanddetach(&current_task,
5274                                                ISC_EVENT_PTR(&dev));
5275                 }
5276         }
5277
5278         UNLOCK(&sock->lock);
5279 }
5280
5281 isc_sockettype_t
5282 isc_socket_gettype(isc_socket_t *sock) {
5283         REQUIRE(VALID_SOCKET(sock));
5284
5285         return (sock->type);
5286 }
5287
5288 isc_boolean_t
5289 isc_socket_isbound(isc_socket_t *sock) {
5290         isc_boolean_t val;
5291
5292         LOCK(&sock->lock);
5293         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5294         UNLOCK(&sock->lock);
5295
5296         return (val);
5297 }
5298
5299 void
5300 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
5301 #if defined(IPV6_V6ONLY)
5302         int onoff = yes ? 1 : 0;
5303 #else
5304         UNUSED(yes);
5305         UNUSED(sock);
5306 #endif
5307
5308         REQUIRE(VALID_SOCKET(sock));
5309
5310 #ifdef IPV6_V6ONLY
5311         if (sock->pf == AF_INET6) {
5312                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5313                                (void *)&onoff, sizeof(int)) < 0) {
5314                         char strbuf[ISC_STRERRORSIZE];
5315
5316                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5317                                          "setsockopt(%d, IPV6_V6ONLY) "
5318                                          "%s: %s", sock->fd,
5319                                          isc_msgcat_get(isc_msgcat,
5320                                                         ISC_MSGSET_GENERAL,
5321                                                         ISC_MSG_FAILED,
5322                                                         "failed"),
5323                                          strbuf);
5324                 }
5325         }
5326         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5327 #endif
5328 }
5329
5330 #ifndef ISC_PLATFORM_USETHREADS
5331 /* In our assumed scenario, we can simply use a single static object. */
5332 static isc_socketwait_t swait_private;
5333
5334 int
5335 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
5336         int n;
5337 #ifdef USE_KQUEUE
5338         struct timespec ts, *tsp;
5339 #endif
5340 #ifdef USE_EPOLL
5341         int timeout;
5342 #endif
5343 #ifdef USE_DEVPOLL
5344         struct dvpoll dvp;
5345 #endif
5346
5347         REQUIRE(swaitp != NULL && *swaitp == NULL);
5348
5349         if (socketmgr == NULL)
5350                 return (0);
5351
5352 #ifdef USE_KQUEUE
5353         if (tvp != NULL) {
5354                 ts.tv_sec = tvp->tv_sec;
5355                 ts.tv_nsec = tvp->tv_usec * 1000;
5356                 tsp = &ts;
5357         } else
5358                 tsp = NULL;
5359         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
5360                                        socketmgr->events, socketmgr->nevents,
5361                                        tsp);
5362         n = swait_private.nevents;
5363 #elif defined(USE_EPOLL)
5364         if (tvp != NULL)
5365                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5366         else
5367                 timeout = -1;
5368         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
5369                                            socketmgr->events,
5370                                            socketmgr->nevents, timeout);
5371         n = swait_private.nevents;
5372 #elif defined(USE_DEVPOLL)
5373         dvp.dp_fds = socketmgr->events;
5374         dvp.dp_nfds = socketmgr->nevents;
5375         if (tvp != NULL) {
5376                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5377                         (tvp->tv_usec + 999) / 1000;
5378         } else
5379                 dvp.dp_timeout = -1;
5380         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
5381         n = swait_private.nevents;
5382 #elif defined(USE_SELECT)
5383         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
5384                socketmgr->fd_bufsize);
5385         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
5386                socketmgr->fd_bufsize);
5387
5388         swait_private.readset = socketmgr->read_fds_copy;
5389         swait_private.writeset = socketmgr->write_fds_copy;
5390         swait_private.maxfd = socketmgr->maxfd + 1;
5391
5392         n = select(swait_private.maxfd, swait_private.readset,
5393                    swait_private.writeset, NULL, tvp);
5394 #endif
5395
5396         *swaitp = &swait_private;
5397         return (n);
5398 }
5399
5400 isc_result_t
5401 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
5402         REQUIRE(swait == &swait_private);
5403
5404         if (socketmgr == NULL)
5405                 return (ISC_R_NOTFOUND);
5406
5407 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5408         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
5409         return (ISC_R_SUCCESS);
5410 #elif defined(USE_SELECT)
5411         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5412         return (ISC_R_SUCCESS);
5413 #endif
5414 }
5415 #endif /* ISC_PLATFORM_USETHREADS */
5416
5417 void
5418 isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
5419
5420         /*
5421          * Name 'socket'.
5422          */
5423
5424         REQUIRE(VALID_SOCKET(socket));
5425
5426         LOCK(&socket->lock);
5427         memset(socket->name, 0, sizeof(socket->name));
5428         strncpy(socket->name, name, sizeof(socket->name) - 1);
5429         socket->tag = tag;
5430         UNLOCK(&socket->lock);
5431 }
5432
5433 const char *
5434 isc_socket_getname(isc_socket_t *socket) {
5435         return (socket->name);
5436 }
5437
5438 void *
5439 isc_socket_gettag(isc_socket_t *socket) {
5440         return (socket->tag);
5441 }
5442
5443 #ifdef HAVE_LIBXML2
5444
5445 static const char *
5446 _socktype(isc_sockettype_t type)
5447 {
5448         if (type == isc_sockettype_udp)
5449                 return ("udp");
5450         else if (type == isc_sockettype_tcp)
5451                 return ("tcp");
5452         else if (type == isc_sockettype_unix)
5453                 return ("unix");
5454         else if (type == isc_sockettype_fdwatch)
5455                 return ("fdwatch");
5456         else
5457                 return ("not-initialized");
5458 }
5459
5460 void
5461 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
5462 {
5463         isc_socket_t *sock;
5464         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5465         isc_sockaddr_t addr;
5466         ISC_SOCKADDR_LEN_T len;
5467
5468         LOCK(&mgr->lock);
5469
5470 #ifndef ISC_PLATFORM_USETHREADS
5471         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5472         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5473         xmlTextWriterEndElement(writer);
5474 #endif
5475
5476         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5477         sock = ISC_LIST_HEAD(mgr->socklist);
5478         while (sock != NULL) {
5479                 LOCK(&sock->lock);
5480                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5481
5482                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5483                 xmlTextWriterWriteFormatString(writer, "%p", sock);
5484                 xmlTextWriterEndElement(writer);
5485
5486                 if (sock->name[0] != 0) {
5487                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5488                         xmlTextWriterWriteFormatString(writer, "%s",
5489                                                        sock->name);
5490                         xmlTextWriterEndElement(writer); /* name */
5491                 }
5492
5493                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5494                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5495                 xmlTextWriterEndElement(writer);
5496
5497                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5498                                           ISC_XMLCHAR _socktype(sock->type));
5499
5500                 if (sock->connected) {
5501                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5502                                             sizeof(peerbuf));
5503                         xmlTextWriterWriteElement(writer,
5504                                                   ISC_XMLCHAR "peer-address",
5505                                                   ISC_XMLCHAR peerbuf);
5506                 }
5507
5508                 len = sizeof(addr);
5509                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5510                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5511                         xmlTextWriterWriteElement(writer,
5512                                                   ISC_XMLCHAR "local-address",
5513                                                   ISC_XMLCHAR peerbuf);
5514                 }
5515
5516                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5517                 if (sock->pending_recv)
5518                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5519                                                 ISC_XMLCHAR "pending-receive");
5520                 if (sock->pending_send)
5521                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5522                                                   ISC_XMLCHAR "pending-send");
5523                 if (sock->pending_accept)
5524                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5525                                                  ISC_XMLCHAR "pending_accept");
5526                 if (sock->listener)
5527                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5528                                                   ISC_XMLCHAR "listener");
5529                 if (sock->connected)
5530                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5531                                                   ISC_XMLCHAR "connected");
5532                 if (sock->connecting)
5533                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5534                                                   ISC_XMLCHAR "connecting");
5535                 if (sock->bound)
5536                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5537                                                   ISC_XMLCHAR "bound");
5538
5539                 xmlTextWriterEndElement(writer); /* states */
5540
5541                 xmlTextWriterEndElement(writer); /* socket */
5542
5543                 UNLOCK(&sock->lock);
5544                 sock = ISC_LIST_NEXT(sock, link);
5545         }
5546         xmlTextWriterEndElement(writer); /* sockets */
5547
5548         UNLOCK(&mgr->lock);
5549 }
5550 #endif /* HAVE_LIBXML2 */