]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bind9/lib/isc/unix/socket.c
This commit was generated by cvs2svn to compensate for changes in r177391,
[FreeBSD/FreeBSD.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2007  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.237.18.29 2007/08/28 07:20:06 tbox Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #ifdef ISC_PLATFORM_HAVESYSUNH
29 #include <sys/un.h>
30 #endif
31 #include <sys/time.h>
32 #include <sys/uio.h>
33
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40
41 #include <isc/buffer.h>
42 #include <isc/bufferlist.h>
43 #include <isc/condition.h>
44 #include <isc/formatcheck.h>
45 #include <isc/list.h>
46 #include <isc/log.h>
47 #include <isc/mem.h>
48 #include <isc/msgs.h>
49 #include <isc/mutex.h>
50 #include <isc/net.h>
51 #include <isc/once.h>
52 #include <isc/platform.h>
53 #include <isc/print.h>
54 #include <isc/region.h>
55 #include <isc/socket.h>
56 #include <isc/strerror.h>
57 #include <isc/task.h>
58 #include <isc/thread.h>
59 #include <isc/util.h>
60
61 #include "errno2result.h"
62
63 #ifndef ISC_PLATFORM_USETHREADS
64 #include "socket_p.h"
65 #endif /* ISC_PLATFORM_USETHREADS */
66
/*%
 * The type of a socket-length argument differs between systems: some use
 * int, some size_t and some socklen_t.  It is defined in one place so that
 * it can be changed easily if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
74
75
76 #if defined(SO_BSDCOMPAT) && defined(__linux__)
77 #include <sys/utsname.h>
78 #endif
79
/*%
 * "Soft" errors are the non-fatal errno values that network related
 * functions such as recv() can return.
 *
 * BSDI (and perhaps other systems) will sometimes return <0 from recv()
 * while errno is still 0.  That is broken, but it has to be worked around
 * here, so an errno of 0 is treated as a soft error as well.
 */
#define SOFT_ERROR(e) \
        ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
92
/*!
 * Expands to the category, module and level arguments of a socket debug
 * log call at debug level \c x.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * Debug levels used by the socket code.
 */
#define TRACE_LEVEL             90      /* function entry/exit, tracing */
#define CORRECTNESS_LEVEL       70      /* socket "correctness", e.g. */
                                        /* returning of events */
#define IOEVENT_LEVEL           60      /* socket data send/receive */
#define EVENT_LEVEL             50      /* event tracing, including */
                                        /* send/receive completion events */
#define CREATION_LEVEL          20      /* socket creation/destruction */

/*
 * Ready-made argument triples, one per debug level above.
 */
#define TRACE           DLVL(TRACE_LEVEL)
#define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
#define IOEVENT         DLVL(IOEVENT_LEVEL)
#define EVENT           DLVL(EVENT_LEVEL)
#define CREATION        DLVL(CREATION_LEVEL)
113
114 typedef isc_event_t intev_t;
115
116 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
117 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
118
119 /*!
120  * IPv6 control information.  If the socket is an IPv6 socket we want
121  * to collect the destination address and interface so the client can
122  * set them on outgoing packets.
123  */
124 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
125 #ifndef USE_CMSG
126 #define USE_CMSG        1
127 #endif
128 #endif
129
130 /*%
131  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
132  * a setsockopt() like interface to request timestamps, and if the OS
133  * doesn't do it for us, call gettimeofday() on every UDP receive?
134  */
135 #ifdef SO_TIMESTAMP
136 #ifndef USE_CMSG
137 #define USE_CMSG        1
138 #endif
139 #endif
140
141 /*%
142  * The size to raise the recieve buffer to (from BIND 8).
143  */
144 #define RCVBUFSIZE (32*1024)
145
146 /*%
147  * The number of times a send operation is repeated if the result is EINTR.
148  */
149 #define NRETRIES 10
150
151 struct isc_socket {
152         /* Not locked. */
153         unsigned int            magic;
154         isc_socketmgr_t        *manager;
155         isc_mutex_t             lock;
156         isc_sockettype_t        type;
157
158         /* Locked by socket lock. */
159         ISC_LINK(isc_socket_t)  link;
160         unsigned int            references;
161         int                     fd;
162         int                     pf;
163
164         ISC_LIST(isc_socketevent_t)             send_list;
165         ISC_LIST(isc_socketevent_t)             recv_list;
166         ISC_LIST(isc_socket_newconnev_t)        accept_list;
167         isc_socket_connev_t                    *connect_ev;
168
169         /*
170          * Internal events.  Posted when a descriptor is readable or
171          * writable.  These are statically allocated and never freed.
172          * They will be set to non-purgable before use.
173          */
174         intev_t                 readable_ev;
175         intev_t                 writable_ev;
176
177         isc_sockaddr_t          address;  /* remote address */
178
179         unsigned int            pending_recv : 1,
180                                 pending_send : 1,
181                                 pending_accept : 1,
182                                 listener : 1, /* listener socket */
183                                 connected : 1,
184                                 connecting : 1, /* connect pending */
185                                 bound : 1; /* bound to local addr */
186
187 #ifdef ISC_NET_RECVOVERFLOW
188         unsigned char           overflow; /* used for MSG_TRUNC fake */
189 #endif
190
191         char                    *recvcmsgbuf;
192         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
193         char                    *sendcmsgbuf;
194         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
195 };
196
197 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
198 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
199
200 struct isc_socketmgr {
201         /* Not locked. */
202         unsigned int            magic;
203         isc_mem_t              *mctx;
204         isc_mutex_t             lock;
205         /* Locked by manager lock. */
206         ISC_LIST(isc_socket_t)  socklist;
207         fd_set                  read_fds;
208         fd_set                  write_fds;
209         isc_socket_t           *fds[FD_SETSIZE];
210         int                     fdstate[FD_SETSIZE];
211         int                     maxfd;
212 #ifdef ISC_PLATFORM_USETHREADS
213         isc_thread_t            watcher;
214         isc_condition_t         shutdown_ok;
215         int                     pipe_fds[2];
216 #else /* ISC_PLATFORM_USETHREADS */
217         unsigned int            refs;
218 #endif /* ISC_PLATFORM_USETHREADS */
219 };
220
221 #ifndef ISC_PLATFORM_USETHREADS
222 static isc_socketmgr_t *socketmgr = NULL;
223 #endif /* ISC_PLATFORM_USETHREADS */
224
225 #define CLOSED          0       /* this one must be zero */
226 #define MANAGED         1
227 #define CLOSE_PENDING   2
228
229 /*
230  * send() and recv() iovec counts
231  */
232 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
233 #ifdef ISC_NET_RECVOVERFLOW
234 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
235 #else
236 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
237 #endif
238
239 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
240 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
241 static void free_socket(isc_socket_t **);
242 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
243                                     isc_socket_t **);
244 static void destroy(isc_socket_t **);
245 static void internal_accept(isc_task_t *, isc_event_t *);
246 static void internal_connect(isc_task_t *, isc_event_t *);
247 static void internal_recv(isc_task_t *, isc_event_t *);
248 static void internal_send(isc_task_t *, isc_event_t *);
249 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
250 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
251                               struct msghdr *, struct iovec *, size_t *);
252 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
253                               struct msghdr *, struct iovec *, size_t *);
254
/*
 * Messages that can be passed to select_poke().
 */
#define SELECT_POKE_SHUTDOWN            (-1)
#define SELECT_POKE_NOTHING             (-2)
#define SELECT_POKE_READ                (-3)
#define SELECT_POKE_ACCEPT              SELECT_POKE_READ  /*%< Same as _READ */
#define SELECT_POKE_WRITE               (-4)
#define SELECT_POKE_CONNECT             SELECT_POKE_WRITE /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE               (-5)

/* True when no references to socket 's' remain. */
#define SOCK_DEAD(s)                    ((s)->references == 0)
264
265 static void
266 manager_log(isc_socketmgr_t *sockmgr,
267             isc_logcategory_t *category, isc_logmodule_t *module, int level,
268             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
269 static void
270 manager_log(isc_socketmgr_t *sockmgr,
271             isc_logcategory_t *category, isc_logmodule_t *module, int level,
272             const char *fmt, ...)
273 {
274         char msgbuf[2048];
275         va_list ap;
276
277         if (! isc_log_wouldlog(isc_lctx, level))
278                 return;
279
280         va_start(ap, fmt);
281         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
282         va_end(ap);
283
284         isc_log_write(isc_lctx, category, module, level,
285                       "sockmgr %p: %s", sockmgr, msgbuf);
286 }
287
288 static void
289 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
290            isc_logcategory_t *category, isc_logmodule_t *module, int level,
291            isc_msgcat_t *msgcat, int msgset, int message,
292            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
293 static void
294 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
295            isc_logcategory_t *category, isc_logmodule_t *module, int level,
296            isc_msgcat_t *msgcat, int msgset, int message,
297            const char *fmt, ...)
298 {
299         char msgbuf[2048];
300         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
301         va_list ap;
302
303         if (! isc_log_wouldlog(isc_lctx, level))
304                 return;
305
306         va_start(ap, fmt);
307         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
308         va_end(ap);
309
310         if (address == NULL) {
311                 isc_log_iwrite(isc_lctx, category, module, level,
312                                msgcat, msgset, message,
313                                "socket %p: %s", sock, msgbuf);
314         } else {
315                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
316                 isc_log_iwrite(isc_lctx, category, module, level,
317                                msgcat, msgset, message,
318                                "socket %p %s: %s", sock, peerbuf, msgbuf);
319         }
320 }
321
322 static void
323 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
324         isc_socket_t *sock;
325
326         /*
327          * This is a wakeup on a socket.  If the socket is not in the
328          * process of being closed, start watching it for either reads
329          * or writes.
330          */
331
332         INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
333
334         if (manager->fdstate[fd] == CLOSE_PENDING) {
335                 manager->fdstate[fd] = CLOSED;
336                 FD_CLR(fd, &manager->read_fds);
337                 FD_CLR(fd, &manager->write_fds);
338                 (void)close(fd);
339                 return;
340         }
341         if (manager->fdstate[fd] != MANAGED)
342                 return;
343
344         sock = manager->fds[fd];
345
346         /*
347          * Set requested bit.
348          */
349         if (msg == SELECT_POKE_READ)
350                 FD_SET(sock->fd, &manager->read_fds);
351         if (msg == SELECT_POKE_WRITE)
352                 FD_SET(sock->fd, &manager->write_fds);
353 }
354
355 #ifdef ISC_PLATFORM_USETHREADS
356 /*
357  * Poke the select loop when there is something for us to do.
358  * The write is required (by POSIX) to complete.  That is, we
359  * will not get partial writes.
360  */
361 static void
362 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
363         int cc;
364         int buf[2];
365         char strbuf[ISC_STRERRORSIZE];
366
367         buf[0] = fd;
368         buf[1] = msg;
369
370         do {
371                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
372 #ifdef ENOSR
373                 /*
374                  * Treat ENOSR as EAGAIN but loop slowly as it is
375                  * unlikely to clear fast.
376                  */
377                 if (cc < 0 && errno == ENOSR) {
378                         sleep(1);
379                         errno = EAGAIN;
380                 }
381 #endif
382         } while (cc < 0 && SOFT_ERROR(errno));
383
384         if (cc < 0) {
385                 isc__strerror(errno, strbuf, sizeof(strbuf));
386                 FATAL_ERROR(__FILE__, __LINE__,
387                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
388                                            ISC_MSG_WRITEFAILED,
389                                            "write() failed "
390                                            "during watcher poke: %s"),
391                             strbuf);
392         }
393
394         INSIST(cc == sizeof(buf));
395 }
396
397 /*
398  * Read a message on the internal fd.
399  */
400 static void
401 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
402         int buf[2];
403         int cc;
404         char strbuf[ISC_STRERRORSIZE];
405
406         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
407         if (cc < 0) {
408                 *msg = SELECT_POKE_NOTHING;
409                 *fd = -1;       /* Silence compiler. */
410                 if (SOFT_ERROR(errno))
411                         return;
412
413                 isc__strerror(errno, strbuf, sizeof(strbuf));
414                 FATAL_ERROR(__FILE__, __LINE__,
415                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
416                                            ISC_MSG_READFAILED,
417                                            "read() failed "
418                                            "during watcher poke: %s"),
419                             strbuf);
420                 
421                 return;
422         }
423         INSIST(cc == sizeof(buf));
424
425         *fd = buf[0];
426         *msg = buf[1];
427 }
428 #else /* ISC_PLATFORM_USETHREADS */
429 /*
430  * Update the state of the socketmgr when something changes.
431  */
432 static void
433 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
434         if (msg == SELECT_POKE_SHUTDOWN)
435                 return;
436         else if (fd >= 0)
437                 wakeup_socket(manager, fd, msg);
438         return;
439 }
440 #endif /* ISC_PLATFORM_USETHREADS */
441
442 /*
443  * Make a fd non-blocking.
444  */
445 static isc_result_t
446 make_nonblock(int fd) {
447         int ret;
448         int flags;
449         char strbuf[ISC_STRERRORSIZE];
450 #ifdef USE_FIONBIO_IOCTL
451         int on = 1;
452
453         ret = ioctl(fd, FIONBIO, (char *)&on);
454 #else
455         flags = fcntl(fd, F_GETFL, 0);
456         flags |= PORT_NONBLOCK;
457         ret = fcntl(fd, F_SETFL, flags);
458 #endif
459
460         if (ret == -1) {
461                 isc__strerror(errno, strbuf, sizeof(strbuf));
462                 UNEXPECTED_ERROR(__FILE__, __LINE__,
463 #ifdef USE_FIONBIO_IOCTL
464                                  "ioctl(%d, FIONBIO, &on): %s", fd,
465 #else
466                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
467 #endif
468                                  strbuf);
469
470                 return (ISC_R_UNEXPECTED);
471         }
472
473         return (ISC_R_SUCCESS);
474 }
475
476 #ifdef USE_CMSG
477 /*
478  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
479  * In order to ensure as much portability as possible, we provide wrapper
480  * functions of these macros.
481  * Note that cmsg_space() could run slow on OSes that do not have
482  * CMSG_SPACE.
483  */
484 static inline ISC_SOCKADDR_LEN_T
485 cmsg_len(ISC_SOCKADDR_LEN_T len) {
486 #ifdef CMSG_LEN
487         return (CMSG_LEN(len));
488 #else
489         ISC_SOCKADDR_LEN_T hdrlen;
490
491         /*
492          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
493          * is correct.
494          */
495         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
496         return (hdrlen + len);
497 #endif
498 }
499
500 static inline ISC_SOCKADDR_LEN_T
501 cmsg_space(ISC_SOCKADDR_LEN_T len) {
502 #ifdef CMSG_SPACE
503         return (CMSG_SPACE(len));
504 #else
505         struct msghdr msg;
506         struct cmsghdr *cmsgp;
507         /*
508          * XXX: The buffer length is an ad-hoc value, but should be enough
509          * in a practical sense.
510          */
511         char dummybuf[sizeof(struct cmsghdr) + 1024];
512
513         memset(&msg, 0, sizeof(msg));
514         msg.msg_control = dummybuf;
515         msg.msg_controllen = sizeof(dummybuf);
516
517         cmsgp = (struct cmsghdr *)dummybuf;
518         cmsgp->cmsg_len = cmsg_len(len);
519
520         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
521         if (cmsgp != NULL)
522                 return ((char *)cmsgp - (char *)msg.msg_control);
523         else
524                 return (0);
525 #endif  
526 }
527 #endif /* USE_CMSG */
528
529 /*
530  * Process control messages received on a socket.
531  */
532 static void
533 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
534 #ifdef USE_CMSG
535         struct cmsghdr *cmsgp;
536 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
537         struct in6_pktinfo *pktinfop;
538 #endif
539 #ifdef SO_TIMESTAMP
540         struct timeval *timevalp;
541 #endif
542 #endif
543
544         /*
545          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
546          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
547          * They are all here, outside of the CPP tests, because it is
548          * more consistent with the usual ISC coding style.
549          */
550         UNUSED(sock);
551         UNUSED(msg);
552         UNUSED(dev);
553
554 #ifdef ISC_NET_BSD44MSGHDR
555
556 #ifdef MSG_TRUNC
557         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
558                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
559 #endif
560
561 #ifdef MSG_CTRUNC
562         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
563                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
564 #endif
565
566 #ifndef USE_CMSG
567         return;
568 #else
569         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
570                 return;
571
572 #ifdef SO_TIMESTAMP
573         timevalp = NULL;
574 #endif
575 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
576         pktinfop = NULL;
577 #endif
578
579         cmsgp = CMSG_FIRSTHDR(msg);
580         while (cmsgp != NULL) {
581                 socket_log(sock, NULL, TRACE,
582                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
583                            "processing cmsg %p", cmsgp);
584
585 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
586                 if (cmsgp->cmsg_level == IPPROTO_IPV6
587                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
588
589                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
590                         memcpy(&dev->pktinfo, pktinfop,
591                                sizeof(struct in6_pktinfo));
592                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
593                         socket_log(sock, NULL, TRACE,
594                                    isc_msgcat, ISC_MSGSET_SOCKET,
595                                    ISC_MSG_IFRECEIVED,
596                                    "interface received on ifindex %u",
597                                    dev->pktinfo.ipi6_ifindex);
598                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
599                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;                         
600                         goto next;
601                 }
602 #endif
603
604 #ifdef SO_TIMESTAMP
605                 if (cmsgp->cmsg_level == SOL_SOCKET
606                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
607                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
608                         dev->timestamp.seconds = timevalp->tv_sec;
609                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
610                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
611                         goto next;
612                 }
613 #endif
614
615         next:
616                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
617         }
618 #endif /* USE_CMSG */
619
620 #endif /* ISC_NET_BSD44MSGHDR */
621 }
622
623 /*
624  * Construct an iov array and attach it to the msghdr passed in.  This is
625  * the SEND constructor, which will use the used region of the buffer
626  * (if using a buffer list) or will use the internal region (if a single
627  * buffer I/O is requested).
628  *
629  * Nothing can be NULL, and the done event must list at least one buffer
630  * on the buffer linked list for this function to be meaningful.
631  *
632  * If write_countp != NULL, *write_countp will hold the number of bytes
633  * this transaction can send.
634  */
635 static void
636 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
637                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
638 {
639         unsigned int iovcount;
640         isc_buffer_t *buffer;
641         isc_region_t used;
642         size_t write_count;
643         size_t skip_count;
644
645         memset(msg, 0, sizeof(*msg));
646
647         if (sock->type == isc_sockettype_udp) {
648                 msg->msg_name = (void *)&dev->address.type.sa;
649                 msg->msg_namelen = dev->address.length;
650         } else {
651                 msg->msg_name = NULL;
652                 msg->msg_namelen = 0;
653         }
654
655         buffer = ISC_LIST_HEAD(dev->bufferlist);
656         write_count = 0;
657         iovcount = 0;
658
659         /*
660          * Single buffer I/O?  Skip what we've done so far in this region.
661          */
662         if (buffer == NULL) {
663                 write_count = dev->region.length - dev->n;
664                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
665                 iov[0].iov_len = write_count;
666                 iovcount = 1;
667
668                 goto config;
669         }
670
671         /*
672          * Multibuffer I/O.
673          * Skip the data in the buffer list that we have already written.
674          */
675         skip_count = dev->n;
676         while (buffer != NULL) {
677                 REQUIRE(ISC_BUFFER_VALID(buffer));
678                 if (skip_count < isc_buffer_usedlength(buffer))
679                         break;
680                 skip_count -= isc_buffer_usedlength(buffer);
681                 buffer = ISC_LIST_NEXT(buffer, link);
682         }
683
684         while (buffer != NULL) {
685                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
686
687                 isc_buffer_usedregion(buffer, &used);
688
689                 if (used.length > 0) {
690                         iov[iovcount].iov_base = (void *)(used.base
691                                                           + skip_count);
692                         iov[iovcount].iov_len = used.length - skip_count;
693                         write_count += (used.length - skip_count);
694                         skip_count = 0;
695                         iovcount++;
696                 }
697                 buffer = ISC_LIST_NEXT(buffer, link);
698         }
699
700         INSIST(skip_count == 0U);
701
702  config:
703         msg->msg_iov = iov;
704         msg->msg_iovlen = iovcount;
705
706 #ifdef ISC_NET_BSD44MSGHDR
707         msg->msg_control = NULL;
708         msg->msg_controllen = 0;
709         msg->msg_flags = 0;
710 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
711         if ((sock->type == isc_sockettype_udp)
712             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
713                 struct cmsghdr *cmsgp;
714                 struct in6_pktinfo *pktinfop;
715
716                 socket_log(sock, NULL, TRACE,
717                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
718                            "sendto pktinfo data, ifindex %u",
719                            dev->pktinfo.ipi6_ifindex);
720
721                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
722                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
723                 msg->msg_control = (void *)sock->sendcmsgbuf;
724
725                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
726                 cmsgp->cmsg_level = IPPROTO_IPV6;
727                 cmsgp->cmsg_type = IPV6_PKTINFO;
728                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
729                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
730                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
731         }
732 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
733 #else /* ISC_NET_BSD44MSGHDR */
734         msg->msg_accrights = NULL;
735         msg->msg_accrightslen = 0;
736 #endif /* ISC_NET_BSD44MSGHDR */
737
738         if (write_countp != NULL)
739                 *write_countp = write_count;
740 }
741
742 /*
743  * Construct an iov array and attach it to the msghdr passed in.  This is
744  * the RECV constructor, which will use the avialable region of the buffer
745  * (if using a buffer list) or will use the internal region (if a single
746  * buffer I/O is requested).
747  *
748  * Nothing can be NULL, and the done event must list at least one buffer
749  * on the buffer linked list for this function to be meaningful.
750  *
751  * If read_countp != NULL, *read_countp will hold the number of bytes
752  * this transaction can receive.
753  */
754 static void
755 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
756                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
757 {
758         unsigned int iovcount;
759         isc_buffer_t *buffer;
760         isc_region_t available;
761         size_t read_count;
762
763         memset(msg, 0, sizeof(struct msghdr));
764
765         if (sock->type == isc_sockettype_udp) {
766                 memset(&dev->address, 0, sizeof(dev->address));
767 #ifdef BROKEN_RECVMSG
768                 if (sock->pf == AF_INET) {
769                         msg->msg_name = (void *)&dev->address.type.sin;
770                         msg->msg_namelen = sizeof(dev->address.type.sin6);
771                 } else if (sock->pf == AF_INET6) {
772                         msg->msg_name = (void *)&dev->address.type.sin6;
773                         msg->msg_namelen = sizeof(dev->address.type.sin6);
774 #ifdef ISC_PLATFORM_HAVESYSUNH
775                 } else if (sock->pf == AF_UNIX) {
776                         msg->msg_name = (void *)&dev->address.type.sunix;
777                         msg->msg_namelen = sizeof(dev->address.type.sunix);
778 #endif
779                 } else {
780                         msg->msg_name = (void *)&dev->address.type.sa;
781                         msg->msg_namelen = sizeof(dev->address.type);
782                 }
783 #else
784                 msg->msg_name = (void *)&dev->address.type.sa;
785                 msg->msg_namelen = sizeof(dev->address.type);
786 #endif
787 #ifdef ISC_NET_RECVOVERFLOW
788                 /* If needed, steal one iovec for overflow detection. */
789                 maxiov--;
790 #endif
791         } else { /* TCP */
792                 msg->msg_name = NULL;
793                 msg->msg_namelen = 0;
794                 dev->address = sock->address;
795         }
796
797         buffer = ISC_LIST_HEAD(dev->bufferlist);
798         read_count = 0;
799
800         /*
801          * Single buffer I/O?  Skip what we've done so far in this region.
802          */
803         if (buffer == NULL) {
804                 read_count = dev->region.length - dev->n;
805                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
806                 iov[0].iov_len = read_count;
807                 iovcount = 1;
808
809                 goto config;
810         }
811
812         /*
813          * Multibuffer I/O.
814          * Skip empty buffers.
815          */
816         while (buffer != NULL) {
817                 REQUIRE(ISC_BUFFER_VALID(buffer));
818                 if (isc_buffer_availablelength(buffer) != 0)
819                         break;
820                 buffer = ISC_LIST_NEXT(buffer, link);
821         }
822
823         iovcount = 0;
824         while (buffer != NULL) {
825                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
826
827                 isc_buffer_availableregion(buffer, &available);
828
829                 if (available.length > 0) {
830                         iov[iovcount].iov_base = (void *)(available.base);
831                         iov[iovcount].iov_len = available.length;
832                         read_count += available.length;
833                         iovcount++;
834                 }
835                 buffer = ISC_LIST_NEXT(buffer, link);
836         }
837
838  config:
839
840         /*
841          * If needed, set up to receive that one extra byte.  Note that
842          * we know there is at least one iov left, since we stole it
843          * at the top of this function.
844          */
845 #ifdef ISC_NET_RECVOVERFLOW
846         if (sock->type == isc_sockettype_udp) {
847                 iov[iovcount].iov_base = (void *)(&sock->overflow);
848                 iov[iovcount].iov_len = 1;
849                 iovcount++;
850         }
851 #endif
852
853         msg->msg_iov = iov;
854         msg->msg_iovlen = iovcount;
855
856 #ifdef ISC_NET_BSD44MSGHDR
857         msg->msg_control = NULL;
858         msg->msg_controllen = 0;
859         msg->msg_flags = 0;
860 #if defined(USE_CMSG)
861         if (sock->type == isc_sockettype_udp) {
862                 msg->msg_control = sock->recvcmsgbuf;
863                 msg->msg_controllen = sock->recvcmsgbuflen;
864         }
865 #endif /* USE_CMSG */
866 #else /* ISC_NET_BSD44MSGHDR */
867         msg->msg_accrights = NULL;
868         msg->msg_accrightslen = 0;
869 #endif /* ISC_NET_BSD44MSGHDR */
870
871         if (read_countp != NULL)
872                 *read_countp = read_count;
873 }
874
875 static void
876 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
877                 isc_socketevent_t *dev)
878 {
879         if (sock->type == isc_sockettype_udp) {
880                 if (address != NULL)
881                         dev->address = *address;
882                 else
883                         dev->address = sock->address;
884         } else if (sock->type == isc_sockettype_tcp) {
885                 INSIST(address == NULL);
886                 dev->address = sock->address;
887         }
888 }
889
890 static void
891 destroy_socketevent(isc_event_t *event) {
892         isc_socketevent_t *ev = (isc_socketevent_t *)event;
893
894         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
895
896         (ev->destroy)(event);
897 }
898
899 static isc_socketevent_t *
900 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
901                      isc_taskaction_t action, const void *arg)
902 {
903         isc_socketevent_t *ev;
904
905         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
906                                                      sock, eventtype,
907                                                      action, arg,
908                                                      sizeof(*ev));
909
910         if (ev == NULL)
911                 return (NULL);
912
913         ev->result = ISC_R_UNEXPECTED;
914         ISC_LINK_INIT(ev, ev_link);
915         ISC_LIST_INIT(ev->bufferlist);
916         ev->region.base = NULL;
917         ev->n = 0;
918         ev->offset = 0;
919         ev->attributes = 0;
920         ev->destroy = ev->ev_destroy;
921         ev->ev_destroy = destroy_socketevent;
922
923         return (ev);
924 }
925
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' (name, I/O vector and,
 * where available, control data) to stdout.
 *
 * Length fields have platform-dependent integer types, so each one is
 * cast to a type that matches its printf conversion; pointers are cast
 * to (void *) as "%p" requires.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
               (unsigned long)msg->msg_namelen);
        printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
               (unsigned long)msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %lu\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
               (unsigned long)msg->msg_controllen);
#endif
}
#endif
944
945 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
946 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
947 #define DOIO_HARD               2       /* i/o error, event sent */
948 #define DOIO_EOF                3       /* EOF, no event sent */
949
950 static int
951 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
952         int cc;
953         struct iovec iov[MAXSCATTERGATHER_RECV];
954         size_t read_count;
955         size_t actual_count;
956         struct msghdr msghdr;
957         isc_buffer_t *buffer;
958         int recv_errno;
959         char strbuf[ISC_STRERRORSIZE];
960
961         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
962
963 #if defined(ISC_SOCKET_DEBUG)
964         dump_msg(&msghdr);
965 #endif
966
967         cc = recvmsg(sock->fd, &msghdr, 0);
968         recv_errno = errno;
969
970 #if defined(ISC_SOCKET_DEBUG)
971         dump_msg(&msghdr);
972 #endif
973
974         if (cc < 0) {
975                 if (SOFT_ERROR(recv_errno))
976                         return (DOIO_SOFT);
977
978                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
979                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
980                         socket_log(sock, NULL, IOEVENT,
981                                    isc_msgcat, ISC_MSGSET_SOCKET,
982                                    ISC_MSG_DOIORECV, 
983                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
984                                    sock->fd, cc, recv_errno, strbuf);
985                 }
986
987 #define SOFT_OR_HARD(_system, _isc) \
988         if (recv_errno == _system) { \
989                 if (sock->connected) { \
990                         dev->result = _isc; \
991                         return (DOIO_HARD); \
992                 } \
993                 return (DOIO_SOFT); \
994         }
995 #define ALWAYS_HARD(_system, _isc) \
996         if (recv_errno == _system) { \
997                 dev->result = _isc; \
998                 return (DOIO_HARD); \
999         }
1000
1001                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1002                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1003                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1004                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1005                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1006                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1007                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1008
1009 #undef SOFT_OR_HARD
1010 #undef ALWAYS_HARD
1011
1012                 dev->result = isc__errno2result(recv_errno);
1013                 return (DOIO_HARD);
1014         }
1015
1016         /*
1017          * On TCP, zero length reads indicate EOF, while on
1018          * UDP, zero length reads are perfectly valid, although
1019          * strange.
1020          */
1021         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1022                 return (DOIO_EOF);
1023
1024         if (sock->type == isc_sockettype_udp) {
1025                 dev->address.length = msghdr.msg_namelen;
1026                 if (isc_sockaddr_getport(&dev->address) == 0) {
1027                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1028                                 socket_log(sock, &dev->address, IOEVENT,
1029                                            isc_msgcat, ISC_MSGSET_SOCKET,
1030                                            ISC_MSG_ZEROPORT, 
1031                                            "dropping source port zero packet");
1032                         }
1033                         return (DOIO_SOFT);
1034                 }
1035         }
1036
1037         socket_log(sock, &dev->address, IOEVENT,
1038                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1039                    "packet received correctly");
1040
1041         /*
1042          * Overflow bit detection.  If we received MORE bytes than we should,
1043          * this indicates an overflow situation.  Set the flag in the
1044          * dev entry and adjust how much we read by one.
1045          */
1046 #ifdef ISC_NET_RECVOVERFLOW
1047         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1048                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1049                 cc--;
1050         }
1051 #endif
1052
1053         /*
1054          * If there are control messages attached, run through them and pull
1055          * out the interesting bits.
1056          */
1057         if (sock->type == isc_sockettype_udp)
1058                 process_cmsg(sock, &msghdr, dev);
1059
1060         /*
1061          * update the buffers (if any) and the i/o count
1062          */
1063         dev->n += cc;
1064         actual_count = cc;
1065         buffer = ISC_LIST_HEAD(dev->bufferlist);
1066         while (buffer != NULL && actual_count > 0U) {
1067                 REQUIRE(ISC_BUFFER_VALID(buffer));
1068                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1069                         actual_count -= isc_buffer_availablelength(buffer);
1070                         isc_buffer_add(buffer,
1071                                        isc_buffer_availablelength(buffer));
1072                 } else {
1073                         isc_buffer_add(buffer, actual_count);
1074                         actual_count = 0;
1075                         break;
1076                 }
1077                 buffer = ISC_LIST_NEXT(buffer, link);
1078                 if (buffer == NULL) {
1079                         INSIST(actual_count == 0U);
1080                 }
1081         }
1082
1083         /*
1084          * If we read less than we expected, update counters,
1085          * and let the upper layer poke the descriptor.
1086          */
1087         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1088                 return (DOIO_SOFT);
1089
1090         /*
1091          * Full reads are posted, or partials if partials are ok.
1092          */
1093         dev->result = ISC_R_SUCCESS;
1094         return (DOIO_SUCCESS);
1095 }
1096
1097 /*
1098  * Returns:
1099  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1100  *                      ISC_R_SUCCESS.
1101  *
1102  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1103  *                      dev->result contains the appropriate error.
1104  *
1105  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1106  *                      event was sent.  The operation should be retried.
1107  *
1108  *      No other return values are possible.
1109  */
1110 static int
1111 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1112         int cc;
1113         struct iovec iov[MAXSCATTERGATHER_SEND];
1114         size_t write_count;
1115         struct msghdr msghdr;
1116         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1117         int attempts = 0;
1118         int send_errno;
1119         char strbuf[ISC_STRERRORSIZE];
1120
1121         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1122
1123  resend:
1124         cc = sendmsg(sock->fd, &msghdr, 0);
1125         send_errno = errno;
1126
1127         /*
1128          * Check for error or block condition.
1129          */
1130         if (cc < 0) {
1131                 if (send_errno == EINTR && ++attempts < NRETRIES)
1132                         goto resend;
1133
1134                 if (SOFT_ERROR(send_errno))
1135                         return (DOIO_SOFT);
1136
1137 #define SOFT_OR_HARD(_system, _isc) \
1138         if (send_errno == _system) { \
1139                 if (sock->connected) { \
1140                         dev->result = _isc; \
1141                         return (DOIO_HARD); \
1142                 } \
1143                 return (DOIO_SOFT); \
1144         }
1145 #define ALWAYS_HARD(_system, _isc) \
1146         if (send_errno == _system) { \
1147                 dev->result = _isc; \
1148                 return (DOIO_HARD); \
1149         }
1150
1151                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1152                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1153                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1154                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1155                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1156 #ifdef EHOSTDOWN
1157                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1158 #endif
1159                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1160                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1161                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1162                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1163                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1164
1165 #undef SOFT_OR_HARD
1166 #undef ALWAYS_HARD
1167
1168                 /*
1169                  * The other error types depend on whether or not the
1170                  * socket is UDP or TCP.  If it is UDP, some errors
1171                  * that we expect to be fatal under TCP are merely
1172                  * annoying, and are really soft errors.
1173                  *
1174                  * However, these soft errors are still returned as
1175                  * a status.
1176                  */
1177                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1178                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1179                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1180                                  addrbuf, strbuf);
1181                 dev->result = isc__errno2result(send_errno);
1182                 return (DOIO_HARD);
1183         }
1184
1185         if (cc == 0)
1186                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1187                                  "internal_send: send() %s 0",
1188                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1189                                                 ISC_MSG_RETURNED, "returned"));
1190
1191         /*
1192          * If we write less than we expected, update counters, poke.
1193          */
1194         dev->n += cc;
1195         if ((size_t)cc != write_count)
1196                 return (DOIO_SOFT);
1197
1198         /*
1199          * Exactly what we wanted to write.  We're done with this
1200          * entry.  Post its completion event.
1201          */
1202         dev->result = ISC_R_SUCCESS;
1203         return (DOIO_SUCCESS);
1204 }
1205
1206 /*
1207  * Kill.
1208  *
1209  * Caller must ensure that the socket is not locked and no external
1210  * references exist.
1211  */
1212 static void
1213 destroy(isc_socket_t **sockp) {
1214         isc_socket_t *sock = *sockp;
1215         isc_socketmgr_t *manager = sock->manager;
1216
1217         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1218                    ISC_MSG_DESTROYING, "destroying");
1219
1220         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1221         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1222         INSIST(ISC_LIST_EMPTY(sock->send_list));
1223         INSIST(sock->connect_ev == NULL);
1224         REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
1225
1226         LOCK(&manager->lock);
1227
1228         /*
1229          * No one has this socket open, so the watcher doesn't have to be
1230          * poked, and the socket doesn't have to be locked.
1231          */
1232         manager->fds[sock->fd] = NULL;
1233         manager->fdstate[sock->fd] = CLOSE_PENDING;
1234         select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1235         ISC_LIST_UNLINK(manager->socklist, sock, link);
1236
1237 #ifdef ISC_PLATFORM_USETHREADS
1238         if (ISC_LIST_EMPTY(manager->socklist))
1239                 SIGNAL(&manager->shutdown_ok);
1240 #endif /* ISC_PLATFORM_USETHREADS */
1241
1242         /*
1243          * XXX should reset manager->maxfd here
1244          */
1245
1246         UNLOCK(&manager->lock);
1247
1248         free_socket(sockp);
1249 }
1250
1251 static isc_result_t
1252 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1253                 isc_socket_t **socketp)
1254 {
1255         isc_socket_t *sock;
1256         isc_result_t result;
1257         ISC_SOCKADDR_LEN_T cmsgbuflen;
1258
1259         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1260
1261         if (sock == NULL)
1262                 return (ISC_R_NOMEMORY);
1263
1264         result = ISC_R_UNEXPECTED;
1265
1266         sock->magic = 0;
1267         sock->references = 0;
1268
1269         sock->manager = manager;
1270         sock->type = type;
1271         sock->fd = -1;
1272
1273         ISC_LINK_INIT(sock, link);
1274
1275         sock->recvcmsgbuf = NULL;
1276         sock->sendcmsgbuf = NULL;
1277
1278         /*
1279          * set up cmsg buffers
1280          */
1281         cmsgbuflen = 0;
1282 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1283         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1284 #endif
1285 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1286         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1287 #endif
1288         sock->recvcmsgbuflen = cmsgbuflen;
1289         if (sock->recvcmsgbuflen != 0U) {
1290                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1291                 if (sock->recvcmsgbuf == NULL)
1292                         goto error;
1293         }
1294
1295         cmsgbuflen = 0;
1296 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1297         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1298 #endif
1299         sock->sendcmsgbuflen = cmsgbuflen;
1300         if (sock->sendcmsgbuflen != 0U) {
1301                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1302                 if (sock->sendcmsgbuf == NULL)
1303                         goto error;
1304         }
1305
1306         /*
1307          * set up list of readers and writers to be initially empty
1308          */
1309         ISC_LIST_INIT(sock->recv_list);
1310         ISC_LIST_INIT(sock->send_list);
1311         ISC_LIST_INIT(sock->accept_list);
1312         sock->connect_ev = NULL;
1313         sock->pending_recv = 0;
1314         sock->pending_send = 0;
1315         sock->pending_accept = 0;
1316         sock->listener = 0;
1317         sock->connected = 0;
1318         sock->connecting = 0;
1319         sock->bound = 0;
1320
1321         /*
1322          * initialize the lock
1323          */
1324         result = isc_mutex_init(&sock->lock);
1325         if (result != ISC_R_SUCCESS) {
1326                 sock->magic = 0;
1327                 goto error;
1328         }
1329
1330         /*
1331          * Initialize readable and writable events
1332          */
1333         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1334                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1335                        NULL, sock, sock, NULL, NULL);
1336         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1337                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1338                        NULL, sock, sock, NULL, NULL);
1339
1340         sock->magic = SOCKET_MAGIC;
1341         *socketp = sock;
1342
1343         return (ISC_R_SUCCESS);
1344
1345  error:
1346         if (sock->recvcmsgbuf != NULL)
1347                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1348                             sock->recvcmsgbuflen);
1349         if (sock->sendcmsgbuf != NULL)
1350                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1351                             sock->sendcmsgbuflen);
1352         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1353
1354         return (result);
1355 }
1356
1357 /*
1358  * This event requires that the various lists be empty, that the reference
1359  * count be 1, and that the magic number is valid.  The other socket bits,
1360  * like the lock, must be initialized as well.  The fd associated must be
1361  * marked as closed, by setting it to -1 on close, or this routine will
1362  * also close the socket.
1363  */
1364 static void
1365 free_socket(isc_socket_t **socketp) {
1366         isc_socket_t *sock = *socketp;
1367
1368         INSIST(sock->references == 0);
1369         INSIST(VALID_SOCKET(sock));
1370         INSIST(!sock->connecting);
1371         INSIST(!sock->pending_recv);
1372         INSIST(!sock->pending_send);
1373         INSIST(!sock->pending_accept);
1374         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1375         INSIST(ISC_LIST_EMPTY(sock->send_list));
1376         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1377         INSIST(!ISC_LINK_LINKED(sock, link));
1378
1379         if (sock->recvcmsgbuf != NULL)
1380                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1381                             sock->recvcmsgbuflen);
1382         if (sock->sendcmsgbuf != NULL)
1383                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1384                             sock->sendcmsgbuflen);
1385
1386         sock->magic = 0;
1387
1388         DESTROYLOCK(&sock->lock);
1389
1390         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1391
1392         *socketp = NULL;
1393 }
1394
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, parse the "major.minor" prefix of the kernel release string
 * reported by uname() and clear 'bsdcompat' when the kernel is 2.4 or
 * newer.  On other platforms this does nothing.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
         struct utsname uts;
         char *rest;
         long int kmajor;
         long int kminor;

         uname(&uts);    /* Can only fail if the buffer is bad in Linux. */

         /* Paranoia in parsing can be increased, but we trust uname(). */
         kmajor = strtol(uts.release, &rest, 10);
         if (*rest != '.')
                return;

         kminor = strtol(rest + 1, &rest, 10);
         if (kmajor > 2 || (kmajor == 2 && kminor >= 4))
                bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
1432
1433 /*%
1434  * Create a new 'type' socket managed by 'manager'.  Events
1435  * will be posted to 'task' and when dispatched 'action' will be
1436  * called with 'arg' as the arg value.  The new socket is returned
1437  * in 'socketp'.
1438  */
1439 isc_result_t
1440 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1441                   isc_socket_t **socketp)
1442 {
1443         isc_socket_t *sock = NULL;
1444         isc_result_t result;
1445 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1446         int on = 1;
1447 #endif
1448 #if defined(SO_RCVBUF)
1449         ISC_SOCKADDR_LEN_T optlen;
1450         int size;
1451 #endif
1452         char strbuf[ISC_STRERRORSIZE];
1453         const char *err = "socket";
1454         int try = 0;
1455
1456         REQUIRE(VALID_MANAGER(manager));
1457         REQUIRE(socketp != NULL && *socketp == NULL);
1458
1459         result = allocate_socket(manager, type, &sock);
1460         if (result != ISC_R_SUCCESS)
1461                 return (result);
1462
1463         sock->pf = pf;
1464  again:
1465         switch (type) {
1466         case isc_sockettype_udp:
1467                 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1468                 break;
1469         case isc_sockettype_tcp:
1470                 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1471                 break;
1472         case isc_sockettype_unix:
1473                 sock->fd = socket(pf, SOCK_STREAM, 0);
1474                 break;
1475         }
1476         if (sock->fd == -1 && errno == EINTR && try++ < 42)
1477                 goto again;
1478
1479 #ifdef F_DUPFD
1480         /*
1481          * Leave a space for stdio to work in.
1482          */
1483         if (sock->fd >= 0 && sock->fd < 20) {
1484                 int new, tmp;
1485                 new = fcntl(sock->fd, F_DUPFD, 20);
1486                 tmp = errno;
1487                 (void)close(sock->fd);
1488                 errno = tmp;
1489                 sock->fd = new;
1490                 err = "isc_socket_create: fcntl";
1491         }
1492 #endif
1493
1494         if (sock->fd >= (int)FD_SETSIZE) {
1495                 (void)close(sock->fd);
1496                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1497                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1498                                isc_msgcat, ISC_MSGSET_SOCKET,
1499                                ISC_MSG_TOOMANYFDS,
1500                                "%s: too many open file descriptors", "socket");
1501                 free_socket(&sock);
1502                 return (ISC_R_NORESOURCES);
1503         }
1504         
1505         if (sock->fd < 0) {
1506                 free_socket(&sock);
1507
1508                 switch (errno) {
1509                 case EMFILE:
1510                 case ENFILE:
1511                 case ENOBUFS:
1512                         return (ISC_R_NORESOURCES);
1513
1514                 case EPROTONOSUPPORT:
1515                 case EPFNOSUPPORT:
1516                 case EAFNOSUPPORT:
1517                 /*
1518                  * Linux 2.2 (and maybe others) return EINVAL instead of
1519                  * EAFNOSUPPORT.
1520                  */
1521                 case EINVAL:
1522                         return (ISC_R_FAMILYNOSUPPORT);
1523
1524                 default:
1525                         isc__strerror(errno, strbuf, sizeof(strbuf));
1526                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1527                                          "%s() %s: %s", err,
1528                                          isc_msgcat_get(isc_msgcat,
1529                                                         ISC_MSGSET_GENERAL,
1530                                                         ISC_MSG_FAILED,
1531                                                         "failed"),
1532                                          strbuf);
1533                         return (ISC_R_UNEXPECTED);
1534                 }
1535         }
1536
1537         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1538                 (void)close(sock->fd);
1539                 free_socket(&sock);
1540                 return (ISC_R_UNEXPECTED);
1541         }
1542
1543 #ifdef SO_BSDCOMPAT
1544         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
1545                                   clear_bsdcompat) == ISC_R_SUCCESS);
1546         if (type != isc_sockettype_unix && bsdcompat &&
1547             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1548                        (void *)&on, sizeof(on)) < 0) {
1549                 isc__strerror(errno, strbuf, sizeof(strbuf));
1550                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1551                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1552                                  sock->fd,
1553                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1554                                                 ISC_MSG_FAILED, "failed"),
1555                                  strbuf);
1556                 /* Press on... */
1557         }
1558 #endif
1559
1560 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1561         if (type == isc_sockettype_udp) {
1562
1563 #if defined(USE_CMSG)
1564 #if defined(SO_TIMESTAMP)
1565                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1566                                (void *)&on, sizeof(on)) < 0
1567                     && errno != ENOPROTOOPT) {
1568                         isc__strerror(errno, strbuf, sizeof(strbuf));
1569                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1570                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1571                                          sock->fd, 
1572                                          isc_msgcat_get(isc_msgcat,
1573                                                         ISC_MSGSET_GENERAL,
1574                                                         ISC_MSG_FAILED,
1575                                                         "failed"),
1576                                          strbuf);
1577                         /* Press on... */
1578                 }
1579 #endif /* SO_TIMESTAMP */
1580
1581 #if defined(ISC_PLATFORM_HAVEIPV6)
1582                 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1583                         /*
1584                          * Warn explicitly because this anomaly can be hidden
1585                          * in usual operation (and unexpectedly appear later).
1586                          */
1587                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1588                                          "No buffer available to receive "
1589                                          "IPv6 destination");
1590                 }
1591 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1592 #ifdef IPV6_RECVPKTINFO
1593                 /* RFC 3542 */
1594                 if ((pf == AF_INET6)
1595                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1596                                    (void *)&on, sizeof(on)) < 0)) {
1597                         isc__strerror(errno, strbuf, sizeof(strbuf));
1598                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1599                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1600                                          "%s: %s", sock->fd,
1601                                          isc_msgcat_get(isc_msgcat,
1602                                                         ISC_MSGSET_GENERAL,
1603                                                         ISC_MSG_FAILED,
1604                                                         "failed"),
1605                                          strbuf);
1606                 }
1607 #else
1608                 /* RFC 2292 */
1609                 if ((pf == AF_INET6)
1610                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1611                                    (void *)&on, sizeof(on)) < 0)) {
1612                         isc__strerror(errno, strbuf, sizeof(strbuf));
1613                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1614                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1615                                          sock->fd,
1616                                          isc_msgcat_get(isc_msgcat,
1617                                                         ISC_MSGSET_GENERAL,
1618                                                         ISC_MSG_FAILED,
1619                                                         "failed"),
1620                                          strbuf);
1621                 }
1622 #endif /* IPV6_RECVPKTINFO */
1623 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1624 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
1625                 /* use minimum MTU */
1626                 if (pf == AF_INET6) {
1627                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1628                                          IPV6_USE_MIN_MTU,
1629                                          (void *)&on, sizeof(on));
1630                 }
1631 #endif
1632 #endif /* ISC_PLATFORM_HAVEIPV6 */
1633 #endif /* defined(USE_CMSG) */
1634
1635 #if defined(SO_RCVBUF)
1636                 optlen = sizeof(size);
1637                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1638                                (void *)&size, &optlen) >= 0 &&
1639                      size < RCVBUFSIZE) {
1640                         size = RCVBUFSIZE;
1641                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1642                                        (void *)&size, sizeof(size)) == -1) {
1643                                 isc__strerror(errno, strbuf, sizeof(strbuf));
1644                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1645                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
1646                                         sock->fd, size,
1647                                         isc_msgcat_get(isc_msgcat,
1648                                                        ISC_MSGSET_GENERAL,
1649                                                        ISC_MSG_FAILED,
1650                                                        "failed"),
1651                                         strbuf);
1652                         }
1653                 }
1654 #endif
1655         }
1656 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1657
1658         sock->references = 1;
1659         *socketp = sock;
1660
1661         LOCK(&manager->lock);
1662
1663         /*
1664          * Note we don't have to lock the socket like we normally would because
1665          * there are no external references to it yet.
1666          */
1667
1668         manager->fds[sock->fd] = sock;
1669         manager->fdstate[sock->fd] = MANAGED;
1670         ISC_LIST_APPEND(manager->socklist, sock, link);
1671         if (manager->maxfd < sock->fd)
1672                 manager->maxfd = sock->fd;
1673
1674         UNLOCK(&manager->lock);
1675
1676         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1677                    ISC_MSG_CREATED, "created");
1678
1679         return (ISC_R_SUCCESS);
1680 }
1681
1682 /*
1683  * Attach to a socket.  Caller must explicitly detach when it is done.
1684  */
1685 void
1686 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1687         REQUIRE(VALID_SOCKET(sock));
1688         REQUIRE(socketp != NULL && *socketp == NULL);
1689
1690         LOCK(&sock->lock);
1691         sock->references++;
1692         UNLOCK(&sock->lock);
1693
1694         *socketp = sock;
1695 }
1696
1697 /*
1698  * Dereference a socket.  If this is the last reference to it, clean things
1699  * up by destroying the socket.
1700  */
1701 void
1702 isc_socket_detach(isc_socket_t **socketp) {
1703         isc_socket_t *sock;
1704         isc_boolean_t kill_socket = ISC_FALSE;
1705
1706         REQUIRE(socketp != NULL);
1707         sock = *socketp;
1708         REQUIRE(VALID_SOCKET(sock));
1709
1710         LOCK(&sock->lock);
1711         REQUIRE(sock->references > 0);
1712         sock->references--;
1713         if (sock->references == 0)
1714                 kill_socket = ISC_TRUE;
1715         UNLOCK(&sock->lock);
1716
1717         if (kill_socket)
1718                 destroy(&sock);
1719
1720         *socketp = NULL;
1721 }
1722
1723 /*
1724  * I/O is possible on a given socket.  Schedule an event to this task that
1725  * will call an internal function to do the I/O.  This will charge the
1726  * task with the I/O operation and let our select loop handler get back
1727  * to doing something real as fast as possible.
1728  *
1729  * The socket and manager must be locked before calling this function.
1730  */
1731 static void
1732 dispatch_recv(isc_socket_t *sock) {
1733         intev_t *iev;
1734         isc_socketevent_t *ev;
1735
1736         INSIST(!sock->pending_recv);
1737
1738         ev = ISC_LIST_HEAD(sock->recv_list);
1739         if (ev == NULL)
1740                 return;
1741
1742         sock->pending_recv = 1;
1743         iev = &sock->readable_ev;
1744
1745         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1746                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
1747
1748         sock->references++;
1749         iev->ev_sender = sock;
1750         iev->ev_action = internal_recv;
1751         iev->ev_arg = sock;
1752
1753         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1754 }
1755
1756 static void
1757 dispatch_send(isc_socket_t *sock) {
1758         intev_t *iev;
1759         isc_socketevent_t *ev;
1760
1761         INSIST(!sock->pending_send);
1762
1763         ev = ISC_LIST_HEAD(sock->send_list);
1764         if (ev == NULL)
1765                 return;
1766
1767         sock->pending_send = 1;
1768         iev = &sock->writable_ev;
1769
1770         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1771                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
1772
1773         sock->references++;
1774         iev->ev_sender = sock;
1775         iev->ev_action = internal_send;
1776         iev->ev_arg = sock;
1777
1778         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1779 }
1780
1781 /*
1782  * Dispatch an internal accept event.
1783  */
1784 static void
1785 dispatch_accept(isc_socket_t *sock) {
1786         intev_t *iev;
1787         isc_socket_newconnev_t *ev;
1788
1789         INSIST(!sock->pending_accept);
1790
1791         /*
1792          * Are there any done events left, or were they all canceled
1793          * before the manager got the socket lock?
1794          */
1795         ev = ISC_LIST_HEAD(sock->accept_list);
1796         if (ev == NULL)
1797                 return;
1798
1799         sock->pending_accept = 1;
1800         iev = &sock->readable_ev;
1801
1802         sock->references++;  /* keep socket around for this internal event */
1803         iev->ev_sender = sock;
1804         iev->ev_action = internal_accept;
1805         iev->ev_arg = sock;
1806
1807         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1808 }
1809
1810 static void
1811 dispatch_connect(isc_socket_t *sock) {
1812         intev_t *iev;
1813         isc_socket_connev_t *ev;
1814
1815         iev = &sock->writable_ev;
1816
1817         ev = sock->connect_ev;
1818         INSIST(ev != NULL); /* XXX */
1819
1820         INSIST(sock->connecting);
1821
1822         sock->references++;  /* keep socket around for this internal event */
1823         iev->ev_sender = sock;
1824         iev->ev_action = internal_connect;
1825         iev->ev_arg = sock;
1826
1827         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1828 }
1829
1830 /*
1831  * Dequeue an item off the given socket's read queue, set the result code
1832  * in the done event to the one provided, and send it to the task it was
1833  * destined for.
1834  *
1835  * If the event to be sent is on a list, remove it before sending.  If
1836  * asked to, send and detach from the socket as well.
1837  *
1838  * Caller must have the socket locked if the event is attached to the socket.
1839  */
1840 static void
1841 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1842         isc_task_t *task;
1843
1844         task = (*dev)->ev_sender;
1845
1846         (*dev)->ev_sender = sock;
1847
1848         if (ISC_LINK_LINKED(*dev, ev_link))
1849                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1850
1851         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1852             == ISC_SOCKEVENTATTR_ATTACHED)
1853                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1854         else
1855                 isc_task_send(task, (isc_event_t **)dev);
1856 }
1857
1858 /*
1859  * See comments for send_recvdone_event() above.
1860  *
1861  * Caller must have the socket locked if the event is attached to the socket.
1862  */
1863 static void
1864 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1865         isc_task_t *task;
1866
1867         INSIST(dev != NULL && *dev != NULL);
1868
1869         task = (*dev)->ev_sender;
1870         (*dev)->ev_sender = sock;
1871
1872         if (ISC_LINK_LINKED(*dev, ev_link))
1873                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1874
1875         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1876             == ISC_SOCKEVENTATTR_ATTACHED)
1877                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1878         else
1879                 isc_task_send(task, (isc_event_t **)dev);
1880 }
1881
1882 /*
1883  * Call accept() on a socket, to get the new file descriptor.  The listen
1884  * socket is used as a prototype to create a new isc_socket_t.  The new
1885  * socket has one outstanding reference.  The task receiving the event
1886  * will be detached from just after the event is delivered.
1887  *
1888  * On entry to this function, the event delivered is the internal
1889  * readable event, and the first item on the accept_list should be
1890  * the done event we want to send.  If the list is empty, this is a no-op,
1891  * so just unlock and return.
1892  */
1893 static void
1894 internal_accept(isc_task_t *me, isc_event_t *ev) {
1895         isc_socket_t *sock;
1896         isc_socketmgr_t *manager;
1897         isc_socket_newconnev_t *dev;
1898         isc_task_t *task;
1899         ISC_SOCKADDR_LEN_T addrlen;
1900         int fd;
1901         isc_result_t result = ISC_R_SUCCESS;
1902         char strbuf[ISC_STRERRORSIZE];
1903         const char *err = "accept";
1904
1905         UNUSED(me);
1906
1907         sock = ev->ev_sender;
1908         INSIST(VALID_SOCKET(sock));
1909
1910         LOCK(&sock->lock);
1911         socket_log(sock, NULL, TRACE,
1912                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1913                    "internal_accept called, locked socket");
1914
1915         manager = sock->manager;
1916         INSIST(VALID_MANAGER(manager));
1917
1918         INSIST(sock->listener);
1919         INSIST(sock->pending_accept == 1);
1920         sock->pending_accept = 0;
1921
1922         INSIST(sock->references > 0);
1923         sock->references--;  /* the internal event is done with this socket */
1924         if (sock->references == 0) {
1925                 UNLOCK(&sock->lock);
1926                 destroy(&sock);
1927                 return;
1928         }
1929
1930         /*
1931          * Get the first item off the accept list.
1932          * If it is empty, unlock the socket and return.
1933          */
1934         dev = ISC_LIST_HEAD(sock->accept_list);
1935         if (dev == NULL) {
1936                 UNLOCK(&sock->lock);
1937                 return;
1938         }
1939
1940         /*
1941          * Try to accept the new connection.  If the accept fails with
1942          * EAGAIN or EINTR, simply poke the watcher to watch this socket
1943          * again.  Also ignore ECONNRESET, which has been reported to
1944          * be spuriously returned on Linux 2.2.19 although it is not
1945          * a documented error for accept().  ECONNABORTED has been
1946          * reported for Solaris 8.  The rest are thrown in not because
1947          * we have seen them but because they are ignored by other
1948          * deamons such as BIND 8 and Apache.
1949          */
1950
1951         addrlen = sizeof(dev->newsocket->address.type);
1952         memset(&dev->newsocket->address.type.sa, 0, addrlen);
1953         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1954                     (void *)&addrlen);
1955
1956 #ifdef F_DUPFD
1957         /*
1958          * Leave a space for stdio to work in.
1959          */
1960         if (fd >= 0 && fd < 20) {
1961                 int new, tmp;
1962                 new = fcntl(fd, F_DUPFD, 20);
1963                 tmp = errno;
1964                 (void)close(fd);
1965                 errno = tmp;
1966                 fd = new;
1967                 err = "fcntl";
1968         }
1969 #endif
1970
1971         if (fd < 0) {
1972                 if (SOFT_ERROR(errno))
1973                         goto soft_error;
1974                 switch (errno) {
1975                 case ENOBUFS:
1976                 case ENFILE:
1977                 case ENOMEM:
1978                 case ECONNRESET:
1979                 case ECONNABORTED:
1980                 case EHOSTUNREACH:
1981                 case EHOSTDOWN:
1982                 case ENETUNREACH:
1983                 case ENETDOWN:
1984                 case ECONNREFUSED:
1985 #ifdef EPROTO
1986                 case EPROTO:
1987 #endif
1988 #ifdef ENONET
1989                 case ENONET:
1990 #endif
1991                         goto soft_error;
1992                 default:
1993                         break;
1994                 }
1995                 isc__strerror(errno, strbuf, sizeof(strbuf));
1996                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1997                                  "internal_accept: %s() %s: %s", err,
1998                                  isc_msgcat_get(isc_msgcat,
1999                                                 ISC_MSGSET_GENERAL,
2000                                                 ISC_MSG_FAILED,
2001                                                 "failed"),
2002                                  strbuf);
2003                 fd = -1;
2004                 result = ISC_R_UNEXPECTED;
2005         } else {
2006                 if (addrlen == 0U) {
2007                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2008                                          "internal_accept(): "
2009                                          "accept() failed to return "
2010                                          "remote address");
2011
2012                         (void)close(fd);
2013                         goto soft_error;
2014                 } else if (dev->newsocket->address.type.sa.sa_family !=
2015                            sock->pf)
2016                 {
2017                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2018                                          "internal_accept(): "
2019                                          "accept() returned peer address "
2020                                          "family %u (expected %u)", 
2021                                          dev->newsocket->address.
2022                                          type.sa.sa_family,
2023                                          sock->pf);
2024                         (void)close(fd);
2025                         goto soft_error;
2026                 } else if (fd >= (int)FD_SETSIZE) {
2027                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2028                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2029                                        isc_msgcat, ISC_MSGSET_SOCKET,
2030                                        ISC_MSG_TOOMANYFDS,
2031                                        "%s: too many open file descriptors",
2032                                        "accept");
2033                         (void)close(fd);
2034                         goto soft_error;
2035                 }
2036         }
2037
2038         if (fd != -1) {
2039                 dev->newsocket->address.length = addrlen;
2040                 dev->newsocket->pf = sock->pf;
2041         }
2042
2043         /*
2044          * Pull off the done event.
2045          */
2046         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2047
2048         /*
2049          * Poke watcher if there are more pending accepts.
2050          */
2051         if (!ISC_LIST_EMPTY(sock->accept_list))
2052                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2053
2054         UNLOCK(&sock->lock);
2055
2056         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2057                 (void)close(fd);
2058                 fd = -1;
2059                 result = ISC_R_UNEXPECTED;
2060         }
2061
2062         /*
2063          * -1 means the new socket didn't happen.
2064          */
2065         if (fd != -1) {
2066                 LOCK(&manager->lock);
2067                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2068
2069                 dev->newsocket->fd = fd;
2070                 dev->newsocket->bound = 1;
2071                 dev->newsocket->connected = 1;
2072
2073                 /*
2074                  * Save away the remote address
2075                  */
2076                 dev->address = dev->newsocket->address;
2077
2078                 manager->fds[fd] = dev->newsocket;
2079                 manager->fdstate[fd] = MANAGED;
2080                 if (manager->maxfd < fd)
2081                         manager->maxfd = fd;
2082
2083                 socket_log(sock, &dev->newsocket->address, CREATION,
2084                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2085                            "accepted connection, new socket %p",
2086                            dev->newsocket);
2087
2088                 UNLOCK(&manager->lock);
2089         } else {
2090                 dev->newsocket->references--;
2091                 free_socket(&dev->newsocket);
2092         }
2093         
2094         /*
2095          * Fill in the done event details and send it off.
2096          */
2097         dev->result = result;
2098         task = dev->ev_sender;
2099         dev->ev_sender = sock;
2100
2101         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2102         return;
2103
2104  soft_error:
2105         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2106         UNLOCK(&sock->lock);
2107         return;
2108 }
2109
2110 static void
2111 internal_recv(isc_task_t *me, isc_event_t *ev) {
2112         isc_socketevent_t *dev;
2113         isc_socket_t *sock;
2114
2115         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2116
2117         sock = ev->ev_sender;
2118         INSIST(VALID_SOCKET(sock));
2119
2120         LOCK(&sock->lock);
2121         socket_log(sock, NULL, IOEVENT,
2122                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2123                    "internal_recv: task %p got event %p", me, ev);
2124
2125         INSIST(sock->pending_recv == 1);
2126         sock->pending_recv = 0;
2127
2128         INSIST(sock->references > 0);
2129         sock->references--;  /* the internal event is done with this socket */
2130         if (sock->references == 0) {
2131                 UNLOCK(&sock->lock);
2132                 destroy(&sock);
2133                 return;
2134         }
2135
2136         /*
2137          * Try to do as much I/O as possible on this socket.  There are no
2138          * limits here, currently.
2139          */
2140         dev = ISC_LIST_HEAD(sock->recv_list);
2141         while (dev != NULL) {
2142                 switch (doio_recv(sock, dev)) {
2143                 case DOIO_SOFT:
2144                         goto poke;
2145
2146                 case DOIO_EOF:
2147                         /*
2148                          * read of 0 means the remote end was closed.
2149                          * Run through the event queue and dispatch all
2150                          * the events with an EOF result code.
2151                          */
2152                         do {
2153                                 dev->result = ISC_R_EOF;
2154                                 send_recvdone_event(sock, &dev);
2155                                 dev = ISC_LIST_HEAD(sock->recv_list);
2156                         } while (dev != NULL);
2157                         goto poke;
2158
2159                 case DOIO_SUCCESS:
2160                 case DOIO_HARD:
2161                         send_recvdone_event(sock, &dev);
2162                         break;
2163                 }
2164
2165                 dev = ISC_LIST_HEAD(sock->recv_list);
2166         }
2167
2168  poke:
2169         if (!ISC_LIST_EMPTY(sock->recv_list))
2170                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2171
2172         UNLOCK(&sock->lock);
2173 }
2174
2175 static void
2176 internal_send(isc_task_t *me, isc_event_t *ev) {
2177         isc_socketevent_t *dev;
2178         isc_socket_t *sock;
2179
2180         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2181
2182         /*
2183          * Find out what socket this is and lock it.
2184          */
2185         sock = (isc_socket_t *)ev->ev_sender;
2186         INSIST(VALID_SOCKET(sock));
2187
2188         LOCK(&sock->lock);
2189         socket_log(sock, NULL, IOEVENT,
2190                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2191                    "internal_send: task %p got event %p", me, ev);
2192
2193         INSIST(sock->pending_send == 1);
2194         sock->pending_send = 0;
2195
2196         INSIST(sock->references > 0);
2197         sock->references--;  /* the internal event is done with this socket */
2198         if (sock->references == 0) {
2199                 UNLOCK(&sock->lock);
2200                 destroy(&sock);
2201                 return;
2202         }
2203
2204         /*
2205          * Try to do as much I/O as possible on this socket.  There are no
2206          * limits here, currently.
2207          */
2208         dev = ISC_LIST_HEAD(sock->send_list);
2209         while (dev != NULL) {
2210                 switch (doio_send(sock, dev)) {
2211                 case DOIO_SOFT:
2212                         goto poke;
2213
2214                 case DOIO_HARD:
2215                 case DOIO_SUCCESS:
2216                         send_senddone_event(sock, &dev);
2217                         break;
2218                 }
2219
2220                 dev = ISC_LIST_HEAD(sock->send_list);
2221         }
2222
2223  poke:
2224         if (!ISC_LIST_EMPTY(sock->send_list))
2225                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2226
2227         UNLOCK(&sock->lock);
2228 }
2229
2230 static void
2231 process_fds(isc_socketmgr_t *manager, int maxfd,
2232             fd_set *readfds, fd_set *writefds)
2233 {
2234         int i;
2235         isc_socket_t *sock;
2236         isc_boolean_t unlock_sock;
2237
2238         REQUIRE(maxfd <= (int)FD_SETSIZE);
2239
2240         /*
2241          * Process read/writes on other fds here.  Avoid locking
2242          * and unlocking twice if both reads and writes are possible.
2243          */
2244         for (i = 0; i < maxfd; i++) {
2245 #ifdef ISC_PLATFORM_USETHREADS
2246                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2247                         continue;
2248 #endif /* ISC_PLATFORM_USETHREADS */
2249
2250                 if (manager->fdstate[i] == CLOSE_PENDING) {
2251                         manager->fdstate[i] = CLOSED;
2252                         FD_CLR(i, &manager->read_fds);
2253                         FD_CLR(i, &manager->write_fds);
2254
2255                         (void)close(i);
2256
2257                         continue;
2258                 }
2259
2260                 sock = manager->fds[i];
2261                 unlock_sock = ISC_FALSE;
2262                 if (FD_ISSET(i, readfds)) {
2263                         if (sock == NULL) {
2264                                 FD_CLR(i, &manager->read_fds);
2265                                 goto check_write;
2266                         }
2267                         unlock_sock = ISC_TRUE;
2268                         LOCK(&sock->lock);
2269                         if (!SOCK_DEAD(sock)) {
2270                                 if (sock->listener)
2271                                         dispatch_accept(sock);
2272                                 else
2273                                         dispatch_recv(sock);
2274                         }
2275                         FD_CLR(i, &manager->read_fds);
2276                 }
2277         check_write:
2278                 if (FD_ISSET(i, writefds)) {
2279                         if (sock == NULL) {
2280                                 FD_CLR(i, &manager->write_fds);
2281                                 continue;
2282                         }
2283                         if (!unlock_sock) {
2284                                 unlock_sock = ISC_TRUE;
2285                                 LOCK(&sock->lock);
2286                         }
2287                         if (!SOCK_DEAD(sock)) {
2288                                 if (sock->connecting)
2289                                         dispatch_connect(sock);
2290                                 else
2291                                         dispatch_send(sock);
2292                         }
2293                         FD_CLR(i, &manager->write_fds);
2294                 }
2295                 if (unlock_sock)
2296                         UNLOCK(&sock->lock);
2297         }
2298 }
2299
#ifdef ISC_PLATFORM_USETHREADS
/*
 * Body of the watcher thread.  It loops, blocking in select(), until a
 * shutdown message arrives on the control pipe.
 *
 * Each time select() returns, messages on the control pipe are drained
 * first (socket wakeups adjust what is being watched), and then the
 * ready descriptors are handed to process_fds(), which posts the I/O
 * events to the tasks that will do the work.
 *
 * The manager lock is held throughout, except while blocked in select().
 */
static isc_threadresult_t
watcher(void *uap) {
        isc_socketmgr_t *manager = uap;
        isc_boolean_t shutting_down = ISC_FALSE;
        int ctlfd;
        int nready;
        fd_set readfds;
        fd_set writefds;
        int msg, fd;
        int maxfd;
        char errtext[ISC_STRERRORSIZE];

        /*
         * Get the control fd here.  This will never change.
         */
        LOCK(&manager->lock);
        ctlfd = manager->pipe_fds[0];

        do {
                /*
                 * Snapshot the watch sets under the lock, then wait in
                 * select() without it.  Soft errors just retry with a
                 * fresh snapshot; anything else is fatal.
                 */
                for (;;) {
                        readfds = manager->read_fds;
                        writefds = manager->write_fds;
                        maxfd = manager->maxfd + 1;

                        UNLOCK(&manager->lock);

                        nready = select(maxfd, &readfds, &writefds,
                                        NULL, NULL);
                        if (nready < 0 && !SOFT_ERROR(errno)) {
                                isc__strerror(errno, errtext,
                                              sizeof(errtext));
                                FATAL_ERROR(__FILE__, __LINE__,
                                            "select() %s: %s",
                                            isc_msgcat_get(isc_msgcat,
                                                    ISC_MSGSET_GENERAL,
                                                    ISC_MSG_FAILED,
                                                    "failed"),
                                            errtext);
                        }

                        LOCK(&manager->lock);

                        if (nready >= 0)
                                break;
                }

                /*
                 * Process reads on internal, control fd.  Keep reading
                 * until there is nothing left or shutdown is requested.
                 */
                if (FD_ISSET(ctlfd, &readfds)) {
                        do {
                                select_readmsg(manager, &fd, &msg);

                                manager_log(manager, IOEVENT,
                                            isc_msgcat_get(isc_msgcat,
                                                     ISC_MSGSET_SOCKET,
                                                     ISC_MSG_WATCHERMSG,
                                                     "watcher got message %d"),
                                                     msg);

                                if (msg == SELECT_POKE_SHUTDOWN) {
                                        /*
                                         * Remember the shutdown request;
                                         * the ready descriptors of this
                                         * round are still processed
                                         * before the loop ends.
                                         */
                                        shutting_down = ISC_TRUE;
                                } else if (msg != SELECT_POKE_NOTHING) {
                                        /*
                                         * This is a wakeup on a socket.
                                         * Look at the event queue for both
                                         * read and write, and decide if we
                                         * need to watch on it now or not.
                                         */
                                        wakeup_socket(manager, fd, msg);
                                }
                        } while (msg != SELECT_POKE_NOTHING &&
                                 msg != SELECT_POKE_SHUTDOWN);
                }

                process_fds(manager, maxfd, &readfds, &writefds);
        } while (!shutting_down);

        manager_log(manager, TRACE,
                    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                   ISC_MSG_EXITING, "watcher exiting"));

        UNLOCK(&manager->lock);
        return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
2407
2408 /*
2409  * Create a new socket manager.
2410  */
2411 isc_result_t
2412 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2413         isc_socketmgr_t *manager;
2414 #ifdef ISC_PLATFORM_USETHREADS
2415         char strbuf[ISC_STRERRORSIZE];
2416 #endif
2417         isc_result_t result;
2418
2419         REQUIRE(managerp != NULL && *managerp == NULL);
2420
2421 #ifndef ISC_PLATFORM_USETHREADS
2422         if (socketmgr != NULL) {
2423                 socketmgr->refs++;
2424                 *managerp = socketmgr;
2425                 return (ISC_R_SUCCESS);
2426         }
2427 #endif /* ISC_PLATFORM_USETHREADS */
2428
2429         manager = isc_mem_get(mctx, sizeof(*manager));
2430         if (manager == NULL)
2431                 return (ISC_R_NOMEMORY);
2432
2433         manager->magic = SOCKET_MANAGER_MAGIC;
2434         manager->mctx = NULL;
2435         memset(manager->fds, 0, sizeof(manager->fds));
2436         ISC_LIST_INIT(manager->socklist);
2437         result = isc_mutex_init(&manager->lock);
2438         if (result != ISC_R_SUCCESS) {
2439                 isc_mem_put(mctx, manager, sizeof(*manager));
2440                 return (result);
2441         }
2442 #ifdef ISC_PLATFORM_USETHREADS
2443         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2444                 DESTROYLOCK(&manager->lock);
2445                 isc_mem_put(mctx, manager, sizeof(*manager));
2446                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2447                                  "isc_condition_init() %s",
2448                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2449                                                 ISC_MSG_FAILED, "failed"));
2450                 return (ISC_R_UNEXPECTED);
2451         }
2452
2453         /*
2454          * Create the special fds that will be used to wake up the
2455          * select/poll loop when something internal needs to be done.
2456          */
2457         if (pipe(manager->pipe_fds) != 0) {
2458                 DESTROYLOCK(&manager->lock);
2459                 isc_mem_put(mctx, manager, sizeof(*manager));
2460                 isc__strerror(errno, strbuf, sizeof(strbuf));
2461                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2462                                  "pipe() %s: %s",
2463                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2464                                                 ISC_MSG_FAILED, "failed"),
2465                                  strbuf);
2466
2467                 return (ISC_R_UNEXPECTED);
2468         }
2469
2470         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2471 #if 0
2472         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2473 #endif
2474 #else /* ISC_PLATFORM_USETHREADS */
2475         manager->refs = 1;
2476 #endif /* ISC_PLATFORM_USETHREADS */
2477
2478         /*
2479          * Set up initial state for the select loop
2480          */
2481         FD_ZERO(&manager->read_fds);
2482         FD_ZERO(&manager->write_fds);
2483 #ifdef ISC_PLATFORM_USETHREADS
2484         FD_SET(manager->pipe_fds[0], &manager->read_fds);
2485         manager->maxfd = manager->pipe_fds[0];
2486 #else /* ISC_PLATFORM_USETHREADS */
2487         manager->maxfd = 0;
2488 #endif /* ISC_PLATFORM_USETHREADS */
2489         memset(manager->fdstate, 0, sizeof(manager->fdstate));
2490
2491 #ifdef ISC_PLATFORM_USETHREADS
2492         /*
2493          * Start up the select/poll thread.
2494          */
2495         if (isc_thread_create(watcher, manager, &manager->watcher) !=
2496             ISC_R_SUCCESS) {
2497                 (void)close(manager->pipe_fds[0]);
2498                 (void)close(manager->pipe_fds[1]);
2499                 DESTROYLOCK(&manager->lock);
2500                 isc_mem_put(mctx, manager, sizeof(*manager));
2501                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2502                                  "isc_thread_create() %s",
2503                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2504                                                 ISC_MSG_FAILED, "failed"));
2505                 return (ISC_R_UNEXPECTED);
2506         }
2507 #endif /* ISC_PLATFORM_USETHREADS */
2508         isc_mem_attach(mctx, &manager->mctx);
2509
2510 #ifndef ISC_PLATFORM_USETHREADS
2511         socketmgr = manager;
2512 #endif /* ISC_PLATFORM_USETHREADS */
2513         *managerp = manager;
2514
2515         return (ISC_R_SUCCESS);
2516 }
2517
2518 void
2519 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2520         isc_socketmgr_t *manager;
2521         int i;
2522         isc_mem_t *mctx;
2523
2524         /*
2525          * Destroy a socket manager.
2526          */
2527
2528         REQUIRE(managerp != NULL);
2529         manager = *managerp;
2530         REQUIRE(VALID_MANAGER(manager));
2531
2532 #ifndef ISC_PLATFORM_USETHREADS
2533         if (manager->refs > 1) {
2534                 manager->refs--;
2535                 *managerp = NULL;
2536                 return;
2537         }
2538 #endif /* ISC_PLATFORM_USETHREADS */
2539
2540         LOCK(&manager->lock);
2541
2542 #ifdef ISC_PLATFORM_USETHREADS
2543         /*
2544          * Wait for all sockets to be destroyed.
2545          */
2546         while (!ISC_LIST_EMPTY(manager->socklist)) {
2547                 manager_log(manager, CREATION,
2548                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2549                                            ISC_MSG_SOCKETSREMAIN,
2550                                            "sockets exist"));
2551                 WAIT(&manager->shutdown_ok, &manager->lock);
2552         }
2553 #else /* ISC_PLATFORM_USETHREADS */
2554         /*
2555          * Hope all sockets have been destroyed.
2556          */
2557         if (!ISC_LIST_EMPTY(manager->socklist)) {
2558                 manager_log(manager, CREATION,
2559                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2560                                            ISC_MSG_SOCKETSREMAIN,
2561                                            "sockets exist"));
2562                 INSIST(0);
2563         }
2564 #endif /* ISC_PLATFORM_USETHREADS */
2565
2566         UNLOCK(&manager->lock);
2567
2568         /*
2569          * Here, poke our select/poll thread.  Do this by closing the write
2570          * half of the pipe, which will send EOF to the read half.
2571          * This is currently a no-op in the non-threaded case.
2572          */
2573         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2574
2575 #ifdef ISC_PLATFORM_USETHREADS
2576         /*
2577          * Wait for thread to exit.
2578          */
2579         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2580                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2581                                  "isc_thread_join() %s",
2582                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2583                                                 ISC_MSG_FAILED, "failed"));
2584 #endif /* ISC_PLATFORM_USETHREADS */
2585
2586         /*
2587          * Clean up.
2588          */
2589 #ifdef ISC_PLATFORM_USETHREADS
2590         (void)close(manager->pipe_fds[0]);
2591         (void)close(manager->pipe_fds[1]);
2592         (void)isc_condition_destroy(&manager->shutdown_ok);
2593 #endif /* ISC_PLATFORM_USETHREADS */
2594
2595         for (i = 0; i < (int)FD_SETSIZE; i++)
2596                 if (manager->fdstate[i] == CLOSE_PENDING)
2597                         (void)close(i);
2598
2599         DESTROYLOCK(&manager->lock);
2600         manager->magic = 0;
2601         mctx= manager->mctx;
2602         isc_mem_put(mctx, manager, sizeof(*manager));
2603
2604         isc_mem_detach(&mctx);
2605
2606         *managerp = NULL;
2607 }
2608
2609 static isc_result_t
2610 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2611             unsigned int flags)
2612 {
2613         int io_state;
2614         isc_boolean_t have_lock = ISC_FALSE;
2615         isc_task_t *ntask = NULL;
2616         isc_result_t result = ISC_R_SUCCESS;
2617
2618         dev->ev_sender = task;
2619
2620         if (sock->type == isc_sockettype_udp) {
2621                 io_state = doio_recv(sock, dev);
2622         } else {
2623                 LOCK(&sock->lock);
2624                 have_lock = ISC_TRUE;
2625
2626                 if (ISC_LIST_EMPTY(sock->recv_list))
2627                         io_state = doio_recv(sock, dev);
2628                 else
2629                         io_state = DOIO_SOFT;
2630         }
2631
2632         switch (io_state) {
2633         case DOIO_SOFT:
2634                 /*
2635                  * We couldn't read all or part of the request right now, so
2636                  * queue it.
2637                  *
2638                  * Attach to socket and to task
2639                  */
2640                 isc_task_attach(task, &ntask);
2641                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2642
2643                 if (!have_lock) {
2644                         LOCK(&sock->lock);
2645                         have_lock = ISC_TRUE;
2646                 }
2647
2648                 /*
2649                  * Enqueue the request.  If the socket was previously not being
2650                  * watched, poke the watcher to start paying attention to it.
2651                  */
2652                 if (ISC_LIST_EMPTY(sock->recv_list))
2653                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2654                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2655
2656                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2657                            "socket_recv: event %p -> task %p",
2658                            dev, ntask);
2659
2660                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2661                         result = ISC_R_INPROGRESS;
2662                 break;
2663
2664         case DOIO_EOF:
2665                 dev->result = ISC_R_EOF;
2666                 /* fallthrough */
2667
2668         case DOIO_HARD:
2669         case DOIO_SUCCESS:
2670                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2671                         send_recvdone_event(sock, &dev);
2672                 break;
2673         }
2674
2675         if (have_lock)
2676                 UNLOCK(&sock->lock);
2677
2678         return (result);
2679 }
2680
2681 isc_result_t
2682 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2683                  unsigned int minimum, isc_task_t *task,
2684                  isc_taskaction_t action, const void *arg)
2685 {
2686         isc_socketevent_t *dev;
2687         isc_socketmgr_t *manager;
2688         unsigned int iocount;
2689         isc_buffer_t *buffer;
2690
2691         REQUIRE(VALID_SOCKET(sock));
2692         REQUIRE(buflist != NULL);
2693         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2694         REQUIRE(task != NULL);
2695         REQUIRE(action != NULL);
2696
2697         manager = sock->manager;
2698         REQUIRE(VALID_MANAGER(manager));
2699
2700         iocount = isc_bufferlist_availablecount(buflist);
2701         REQUIRE(iocount > 0);
2702
2703         INSIST(sock->bound);
2704
2705         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2706         if (dev == NULL) {
2707                 return (ISC_R_NOMEMORY);
2708         }
2709
2710         /*
2711          * UDP sockets are always partial read
2712          */
2713         if (sock->type == isc_sockettype_udp)
2714                 dev->minimum = 1;
2715         else {
2716                 if (minimum == 0)
2717                         dev->minimum = iocount;
2718                 else
2719                         dev->minimum = minimum;
2720         }
2721
2722         /*
2723          * Move each buffer from the passed in list to our internal one.
2724          */
2725         buffer = ISC_LIST_HEAD(*buflist);
2726         while (buffer != NULL) {
2727                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2728                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2729                 buffer = ISC_LIST_HEAD(*buflist);
2730         }
2731
2732         return (socket_recv(sock, dev, task, 0));
2733 }
2734
2735 isc_result_t
2736 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2737                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2738 {
2739         isc_socketevent_t *dev;
2740         isc_socketmgr_t *manager;
2741
2742         REQUIRE(VALID_SOCKET(sock));
2743         REQUIRE(action != NULL);
2744
2745         manager = sock->manager;
2746         REQUIRE(VALID_MANAGER(manager));
2747
2748         INSIST(sock->bound);
2749
2750         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2751         if (dev == NULL)
2752                 return (ISC_R_NOMEMORY);
2753
2754         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2755 }
2756
2757 isc_result_t
2758 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2759                  unsigned int minimum, isc_task_t *task,
2760                  isc_socketevent_t *event, unsigned int flags)
2761 {
2762         event->ev_sender = sock;
2763         event->result = ISC_R_UNEXPECTED;
2764         ISC_LIST_INIT(event->bufferlist);
2765         event->region = *region;
2766         event->n = 0;
2767         event->offset = 0;
2768         event->attributes = 0;
2769
2770         /*
2771          * UDP sockets are always partial read.
2772          */
2773         if (sock->type == isc_sockettype_udp)
2774                 event->minimum = 1;
2775         else {
2776                 if (minimum == 0)
2777                         event->minimum = region->length;
2778                 else
2779                         event->minimum = minimum;
2780         }
2781
2782         return (socket_recv(sock, event, task, flags));
2783 }
2784
2785 static isc_result_t
2786 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2787             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2788             unsigned int flags)
2789 {
2790         int io_state;
2791         isc_boolean_t have_lock = ISC_FALSE;
2792         isc_task_t *ntask = NULL;
2793         isc_result_t result = ISC_R_SUCCESS;
2794
2795         dev->ev_sender = task;
2796
2797         set_dev_address(address, sock, dev);
2798         if (pktinfo != NULL) {
2799                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2800                 dev->pktinfo = *pktinfo;
2801
2802                 if (!isc_sockaddr_issitelocal(&dev->address) &&
2803                     !isc_sockaddr_islinklocal(&dev->address)) {
2804                         socket_log(sock, NULL, TRACE, isc_msgcat,
2805                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2806                                    "pktinfo structure provided, ifindex %u "
2807                                    "(set to 0)", pktinfo->ipi6_ifindex);
2808
2809                         /*
2810                          * Set the pktinfo index to 0 here, to let the
2811                          * kernel decide what interface it should send on.
2812                          */
2813                         dev->pktinfo.ipi6_ifindex = 0;
2814                 }
2815         }
2816
2817         if (sock->type == isc_sockettype_udp)
2818                 io_state = doio_send(sock, dev);
2819         else {
2820                 LOCK(&sock->lock);
2821                 have_lock = ISC_TRUE;
2822
2823                 if (ISC_LIST_EMPTY(sock->send_list))
2824                         io_state = doio_send(sock, dev);
2825                 else
2826                         io_state = DOIO_SOFT;
2827         }
2828
2829         switch (io_state) {
2830         case DOIO_SOFT:
2831                 /*
2832                  * We couldn't send all or part of the request right now, so
2833                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2834                  */
2835                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2836                         isc_task_attach(task, &ntask);
2837                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2838
2839                         if (!have_lock) {
2840                                 LOCK(&sock->lock);
2841                                 have_lock = ISC_TRUE;
2842                         }
2843
2844                         /*
2845                          * Enqueue the request.  If the socket was previously
2846                          * not being watched, poke the watcher to start
2847                          * paying attention to it.
2848                          */
2849                         if (ISC_LIST_EMPTY(sock->send_list))
2850                                 select_poke(sock->manager, sock->fd,
2851                                             SELECT_POKE_WRITE);
2852                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2853
2854                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2855                                    "socket_send: event %p -> task %p",
2856                                    dev, ntask);
2857
2858                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2859                                 result = ISC_R_INPROGRESS;
2860                         break;
2861                 }
2862
2863         case DOIO_HARD:
2864         case DOIO_SUCCESS:
2865                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2866                         send_senddone_event(sock, &dev);
2867                 break;
2868         }
2869
2870         if (have_lock)
2871                 UNLOCK(&sock->lock);
2872
2873         return (result);
2874 }
2875
2876 isc_result_t
2877 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2878                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2879 {
2880         /*
2881          * REQUIRE() checking is performed in isc_socket_sendto().
2882          */
2883         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2884                                   NULL));
2885 }
2886
2887 isc_result_t
2888 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2889                   isc_task_t *task, isc_taskaction_t action, const void *arg,
2890                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2891 {
2892         isc_socketevent_t *dev;
2893         isc_socketmgr_t *manager;
2894
2895         REQUIRE(VALID_SOCKET(sock));
2896         REQUIRE(region != NULL);
2897         REQUIRE(task != NULL);
2898         REQUIRE(action != NULL);
2899
2900         manager = sock->manager;
2901         REQUIRE(VALID_MANAGER(manager));
2902
2903         INSIST(sock->bound);
2904
2905         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2906         if (dev == NULL) {
2907                 return (ISC_R_NOMEMORY);
2908         }
2909
2910         dev->region = *region;
2911
2912         return (socket_send(sock, dev, task, address, pktinfo, 0));
2913 }
2914
2915 isc_result_t
2916 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2917                  isc_task_t *task, isc_taskaction_t action, const void *arg)
2918 {
2919         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2920                                    NULL));
2921 }
2922
2923 isc_result_t
2924 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2925                    isc_task_t *task, isc_taskaction_t action, const void *arg,
2926                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2927 {
2928         isc_socketevent_t *dev;
2929         isc_socketmgr_t *manager;
2930         unsigned int iocount;
2931         isc_buffer_t *buffer;
2932
2933         REQUIRE(VALID_SOCKET(sock));
2934         REQUIRE(buflist != NULL);
2935         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2936         REQUIRE(task != NULL);
2937         REQUIRE(action != NULL);
2938
2939         manager = sock->manager;
2940         REQUIRE(VALID_MANAGER(manager));
2941
2942         iocount = isc_bufferlist_usedcount(buflist);
2943         REQUIRE(iocount > 0);
2944
2945         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2946         if (dev == NULL) {
2947                 return (ISC_R_NOMEMORY);
2948         }
2949
2950         /*
2951          * Move each buffer from the passed in list to our internal one.
2952          */
2953         buffer = ISC_LIST_HEAD(*buflist);
2954         while (buffer != NULL) {
2955                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2956                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2957                 buffer = ISC_LIST_HEAD(*buflist);
2958         }
2959
2960         return (socket_send(sock, dev, task, address, pktinfo, 0));
2961 }
2962
2963 isc_result_t
2964 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2965                    isc_task_t *task,
2966                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2967                    isc_socketevent_t *event, unsigned int flags)
2968 {
2969         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2970         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2971                 REQUIRE(sock->type == isc_sockettype_udp);
2972         event->ev_sender = sock;
2973         event->result = ISC_R_UNEXPECTED;
2974         ISC_LIST_INIT(event->bufferlist);
2975         event->region = *region;
2976         event->n = 0;
2977         event->offset = 0;
2978         event->attributes = 0;
2979
2980         return (socket_send(sock, event, task, address, pktinfo, flags));
2981 }
2982
2983 void
2984 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
2985 #ifdef ISC_PLATFORM_HAVESYSUNH
2986         int s;
2987         struct stat sb;
2988         char strbuf[ISC_STRERRORSIZE];
2989
2990         if (sockaddr->type.sa.sa_family != AF_UNIX)
2991                 return;
2992
2993 #ifndef S_ISSOCK
2994 #if defined(S_IFMT) && defined(S_IFSOCK)
2995 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
2996 #elif defined(_S_IFMT) && defined(S_IFSOCK)
2997 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
2998 #endif
2999 #endif
3000
3001 #ifndef S_ISFIFO
3002 #if defined(S_IFMT) && defined(S_IFIFO)
3003 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
3004 #elif defined(_S_IFMT) && defined(S_IFIFO)
3005 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
3006 #endif
3007 #endif
3008
3009 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
3010 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
3011 #endif
3012
3013 #ifndef S_ISFIFO
3014 #define S_ISFIFO(mode) 0
3015 #endif
3016
3017 #ifndef S_ISSOCK
3018 #define S_ISSOCK(mode) 0
3019 #endif
3020
3021         if (active) {
3022                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3023                         isc__strerror(errno, strbuf, sizeof(strbuf));
3024                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3025                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3026                                       "isc_socket_cleanunix: stat(%s): %s",
3027                                       sockaddr->type.sunix.sun_path, strbuf);
3028                         return;
3029                 }
3030                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3031                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3032                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3033                                       "isc_socket_cleanunix: %s: not a socket",
3034                                       sockaddr->type.sunix.sun_path);
3035                         return;
3036                 }
3037                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3038                         isc__strerror(errno, strbuf, sizeof(strbuf));
3039                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3040                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3041                                       "isc_socket_cleanunix: unlink(%s): %s",
3042                                       sockaddr->type.sunix.sun_path, strbuf);
3043                 }
3044                 return;
3045         }
3046
3047         s = socket(AF_UNIX, SOCK_STREAM, 0);
3048         if (s < 0) {
3049                 isc__strerror(errno, strbuf, sizeof(strbuf));
3050                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3051                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3052                               "isc_socket_cleanunix: socket(%s): %s",
3053                               sockaddr->type.sunix.sun_path, strbuf);
3054                 return;
3055         }
3056
3057         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3058                 switch (errno) {
3059                 case ENOENT:    /* We exited cleanly last time */
3060                         break;
3061                 default:
3062                         isc__strerror(errno, strbuf, sizeof(strbuf));
3063                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3064                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3065                                       "isc_socket_cleanunix: stat(%s): %s",
3066                                       sockaddr->type.sunix.sun_path, strbuf);
3067                         break;
3068                 }
3069                 goto cleanup;
3070         }
3071
3072         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3073                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3074                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3075                               "isc_socket_cleanunix: %s: not a socket",
3076                               sockaddr->type.sunix.sun_path);
3077                 goto cleanup;
3078         }
3079
3080         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
3081                     sizeof(sockaddr->type.sunix)) < 0) {
3082                 switch (errno) {
3083                 case ECONNREFUSED:
3084                 case ECONNRESET:
3085                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3086                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3087                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3088                                               ISC_LOGMODULE_SOCKET,
3089                                               ISC_LOG_WARNING,
3090                                               "isc_socket_cleanunix: "
3091                                               "unlink(%s): %s",
3092                                               sockaddr->type.sunix.sun_path,
3093                                               strbuf);
3094                         }
3095                         break;
3096                 default:
3097                         isc__strerror(errno, strbuf, sizeof(strbuf));
3098                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3099                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3100                                       "isc_socket_cleanunix: connect(%s): %s",
3101                                       sockaddr->type.sunix.sun_path, strbuf);
3102                         break;
3103                 }
3104         }
3105  cleanup:
3106         close(s);
3107 #else
3108         UNUSED(sockaddr);
3109         UNUSED(active);
3110 #endif
3111 }
3112
3113 isc_result_t
3114 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
3115                     isc_uint32_t owner, isc_uint32_t group)
3116 {
3117 #ifdef ISC_PLATFORM_HAVESYSUNH
3118         isc_result_t result = ISC_R_SUCCESS;
3119         char strbuf[ISC_STRERRORSIZE];
3120         char path[sizeof(sockaddr->type.sunix.sun_path)];
3121 #ifdef NEED_SECURE_DIRECTORY
3122         char *slash;
3123 #endif
3124
3125         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
3126         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
3127         strcpy(path, sockaddr->type.sunix.sun_path);
3128
3129 #ifdef NEED_SECURE_DIRECTORY
3130         slash = strrchr(path, '/');
3131         if (slash != NULL) {
3132                 if (slash != path)
3133                         *slash = '\0';
3134                 else
3135                         strcpy(path, "/");
3136         } else
3137                 strcpy(path, ".");
3138 #endif
3139         
3140         if (chmod(path, perm) < 0) {
3141                 isc__strerror(errno, strbuf, sizeof(strbuf));
3142                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3143                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3144                               "isc_socket_permunix: chmod(%s, %d): %s",
3145                               path, perm, strbuf);
3146                 result = ISC_R_FAILURE;
3147         }
3148         if (chown(path, owner, group) < 0) {
3149                 isc__strerror(errno, strbuf, sizeof(strbuf));
3150                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3151                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3152                               "isc_socket_permunix: chown(%s, %d, %d): %s",
3153                               path, owner, group,
3154                               strbuf);
3155                 result = ISC_R_FAILURE;
3156         }
3157         return (result);
3158 #else
3159         UNUSED(sockaddr);
3160         UNUSED(perm);
3161         UNUSED(owner);
3162         UNUSED(group);
3163         return (ISC_R_NOTIMPLEMENTED);
3164 #endif
3165 }
3166
3167 isc_result_t
3168 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) {
3169         char strbuf[ISC_STRERRORSIZE];
3170         int on = 1;
3171
3172         LOCK(&sock->lock);
3173
3174         INSIST(!sock->bound);
3175
3176         if (sock->pf != sockaddr->type.sa.sa_family) {
3177                 UNLOCK(&sock->lock);
3178                 return (ISC_R_FAMILYMISMATCH);
3179         }
3180         /*
3181          * Only set SO_REUSEADDR when we want a specific port.
3182          */
3183 #ifdef AF_UNIX
3184         if (sock->pf == AF_UNIX)
3185                 goto bind_socket;
3186 #endif
3187         if (isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3188             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3189                        sizeof(on)) < 0) {
3190                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3191                                  "setsockopt(%d) %s", sock->fd,
3192                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3193                                                 ISC_MSG_FAILED, "failed"));
3194                 /* Press on... */
3195         }
3196 #ifdef AF_UNIX
3197  bind_socket:
3198 #endif
3199         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3200                 UNLOCK(&sock->lock);
3201                 switch (errno) {
3202                 case EACCES:
3203                         return (ISC_R_NOPERM);
3204                 case EADDRNOTAVAIL:
3205                         return (ISC_R_ADDRNOTAVAIL);
3206                 case EADDRINUSE:
3207                         return (ISC_R_ADDRINUSE);
3208                 case EINVAL:
3209                         return (ISC_R_BOUND);
3210                 default:
3211                         isc__strerror(errno, strbuf, sizeof(strbuf));
3212                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3213                                          strbuf);
3214                         return (ISC_R_UNEXPECTED);
3215                 }
3216         }
3217
3218         socket_log(sock, sockaddr, TRACE,
3219                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3220         sock->bound = 1;
3221
3222         UNLOCK(&sock->lock);
3223         return (ISC_R_SUCCESS);
3224 }
3225
3226 isc_result_t
3227 isc_socket_filter(isc_socket_t *sock, const char *filter) {
3228 #ifdef SO_ACCEPTFILTER
3229         char strbuf[ISC_STRERRORSIZE];
3230         struct accept_filter_arg afa;
3231 #else
3232         UNUSED(sock);
3233         UNUSED(filter);
3234 #endif
3235
3236         REQUIRE(VALID_SOCKET(sock));
3237
3238 #ifdef SO_ACCEPTFILTER
3239         bzero(&afa, sizeof(afa));
3240         strncpy(afa.af_name, filter, sizeof(afa.af_name));
3241         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
3242                          &afa, sizeof(afa)) == -1) {
3243                 isc__strerror(errno, strbuf, sizeof(strbuf));
3244                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
3245                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
3246                            strbuf);
3247                 return (ISC_R_FAILURE);
3248         }
3249         return (ISC_R_SUCCESS);
3250 #else
3251         return (ISC_R_NOTIMPLEMENTED);
3252 #endif
3253 }
3254
3255 /*
3256  * Set up to listen on a given socket.  We do this by creating an internal
3257  * event that will be dispatched when the socket has read activity.  The
3258  * watcher will send the internal event to the task when there is a new
3259  * connection.
3260  *
3261  * Unlike in read, we don't preallocate a done event here.  Every time there
3262  * is a new connection we'll have to allocate a new one anyway, so we might
3263  * as well keep things simple rather than having to track them.
3264  */
3265 isc_result_t
3266 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
3267         char strbuf[ISC_STRERRORSIZE];
3268
3269         REQUIRE(VALID_SOCKET(sock));
3270
3271         LOCK(&sock->lock);
3272
3273         REQUIRE(!sock->listener);
3274         REQUIRE(sock->bound);
3275         REQUIRE(sock->type == isc_sockettype_tcp ||
3276                 sock->type == isc_sockettype_unix);
3277
3278         if (backlog == 0)
3279                 backlog = SOMAXCONN;
3280
3281         if (listen(sock->fd, (int)backlog) < 0) {
3282                 UNLOCK(&sock->lock);
3283                 isc__strerror(errno, strbuf, sizeof(strbuf));
3284
3285                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3286
3287                 return (ISC_R_UNEXPECTED);
3288         }
3289
3290         sock->listener = 1;
3291
3292         UNLOCK(&sock->lock);
3293         return (ISC_R_SUCCESS);
3294 }
3295
3296 /*
3297  * This should try to do agressive accept() XXXMLG
3298  */
3299 isc_result_t
3300 isc_socket_accept(isc_socket_t *sock,
3301                   isc_task_t *task, isc_taskaction_t action, const void *arg)
3302 {
3303         isc_socket_newconnev_t *dev;
3304         isc_socketmgr_t *manager;
3305         isc_task_t *ntask = NULL;
3306         isc_socket_t *nsock;
3307         isc_result_t result;
3308         isc_boolean_t do_poke = ISC_FALSE;
3309
3310         REQUIRE(VALID_SOCKET(sock));
3311         manager = sock->manager;
3312         REQUIRE(VALID_MANAGER(manager));
3313
3314         LOCK(&sock->lock);
3315
3316         REQUIRE(sock->listener);
3317
3318         /*
3319          * Sender field is overloaded here with the task we will be sending
3320          * this event to.  Just before the actual event is delivered the
3321          * actual ev_sender will be touched up to be the socket.
3322          */
3323         dev = (isc_socket_newconnev_t *)
3324                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3325                                    action, arg, sizeof(*dev));
3326         if (dev == NULL) {
3327                 UNLOCK(&sock->lock);
3328                 return (ISC_R_NOMEMORY);
3329         }
3330         ISC_LINK_INIT(dev, ev_link);
3331
3332         result = allocate_socket(manager, sock->type, &nsock);
3333         if (result != ISC_R_SUCCESS) {
3334                 isc_event_free(ISC_EVENT_PTR(&dev));
3335                 UNLOCK(&sock->lock);
3336                 return (result);
3337         }
3338
3339         /*
3340          * Attach to socket and to task.
3341          */
3342         isc_task_attach(task, &ntask);
3343         nsock->references++;
3344
3345         dev->ev_sender = ntask;
3346         dev->newsocket = nsock;
3347
3348         /*
3349          * Poke watcher here.  We still have the socket locked, so there
3350          * is no race condition.  We will keep the lock for such a short
3351          * bit of time waking it up now or later won't matter all that much.
3352          */
3353         if (ISC_LIST_EMPTY(sock->accept_list))
3354                 do_poke = ISC_TRUE;
3355
3356         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3357
3358         if (do_poke)
3359                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3360
3361         UNLOCK(&sock->lock);
3362         return (ISC_R_SUCCESS);
3363 }
3364
3365 isc_result_t
3366 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3367                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3368 {
3369         isc_socket_connev_t *dev;
3370         isc_task_t *ntask = NULL;
3371         isc_socketmgr_t *manager;
3372         int cc;
3373         char strbuf[ISC_STRERRORSIZE];
3374
3375         REQUIRE(VALID_SOCKET(sock));
3376         REQUIRE(addr != NULL);
3377         REQUIRE(task != NULL);
3378         REQUIRE(action != NULL);
3379
3380         manager = sock->manager;
3381         REQUIRE(VALID_MANAGER(manager));
3382         REQUIRE(addr != NULL);
3383
3384         if (isc_sockaddr_ismulticast(addr))
3385                 return (ISC_R_MULTICAST);
3386
3387         LOCK(&sock->lock);
3388
3389         REQUIRE(!sock->connecting);
3390
3391         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3392                                                         ISC_SOCKEVENT_CONNECT,
3393                                                         action, arg,
3394                                                         sizeof(*dev));
3395         if (dev == NULL) {
3396                 UNLOCK(&sock->lock);
3397                 return (ISC_R_NOMEMORY);
3398         }
3399         ISC_LINK_INIT(dev, ev_link);
3400
3401         /*
3402          * Try to do the connect right away, as there can be only one
3403          * outstanding, and it might happen to complete.
3404          */
3405         sock->address = *addr;
3406         cc = connect(sock->fd, &addr->type.sa, addr->length);
3407         if (cc < 0) {
3408                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3409                         goto queue;
3410
3411                 switch (errno) {
3412 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3413                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3414                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3415                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3416                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3417                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3418 #ifdef EHOSTDOWN
3419                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3420 #endif
3421                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3422                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3423                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3424                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3425                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3426 #undef ERROR_MATCH
3427                 }
3428
3429                 sock->connected = 0;
3430
3431                 isc__strerror(errno, strbuf, sizeof(strbuf));
3432                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3433
3434                 UNLOCK(&sock->lock);
3435                 isc_event_free(ISC_EVENT_PTR(&dev));
3436                 return (ISC_R_UNEXPECTED);
3437
3438         err_exit:
3439                 sock->connected = 0;
3440                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3441
3442                 UNLOCK(&sock->lock);
3443                 return (ISC_R_SUCCESS);
3444         }
3445
3446         /*
3447          * If connect completed, fire off the done event.
3448          */
3449         if (cc == 0) {
3450                 sock->connected = 1;
3451                 sock->bound = 1;
3452                 dev->result = ISC_R_SUCCESS;
3453                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3454
3455                 UNLOCK(&sock->lock);
3456                 return (ISC_R_SUCCESS);
3457         }
3458
3459  queue:
3460
3461         /*
3462          * Attach to task.
3463          */
3464         isc_task_attach(task, &ntask);
3465
3466         sock->connecting = 1;
3467
3468         dev->ev_sender = ntask;
3469
3470         /*
3471          * Poke watcher here.  We still have the socket locked, so there
3472          * is no race condition.  We will keep the lock for such a short
3473          * bit of time waking it up now or later won't matter all that much.
3474          */
3475         if (sock->connect_ev == NULL)
3476                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3477
3478         sock->connect_ev = dev;
3479
3480         UNLOCK(&sock->lock);
3481         return (ISC_R_SUCCESS);
3482 }
3483
3484 /*
3485  * Called when a socket with a pending connect() finishes.
3486  */
3487 static void
3488 internal_connect(isc_task_t *me, isc_event_t *ev) {
3489         isc_socket_t *sock;
3490         isc_socket_connev_t *dev;
3491         isc_task_t *task;
3492         int cc;
3493         ISC_SOCKADDR_LEN_T optlen;
3494         char strbuf[ISC_STRERRORSIZE];
3495         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3496
3497         UNUSED(me);
3498         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3499
3500         sock = ev->ev_sender;
3501         INSIST(VALID_SOCKET(sock));
3502
3503         LOCK(&sock->lock);
3504
3505         /*
3506          * When the internal event was sent the reference count was bumped
3507          * to keep the socket around for us.  Decrement the count here.
3508          */
3509         INSIST(sock->references > 0);
3510         sock->references--;
3511         if (sock->references == 0) {
3512                 UNLOCK(&sock->lock);
3513                 destroy(&sock);
3514                 return;
3515         }
3516
3517         /*
3518          * Has this event been canceled?
3519          */
3520         dev = sock->connect_ev;
3521         if (dev == NULL) {
3522                 INSIST(!sock->connecting);
3523                 UNLOCK(&sock->lock);
3524                 return;
3525         }
3526
3527         INSIST(sock->connecting);
3528         sock->connecting = 0;
3529
3530         /*
3531          * Get any possible error status here.
3532          */
3533         optlen = sizeof(cc);
3534         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3535                        (void *)&cc, (void *)&optlen) < 0)
3536                 cc = errno;
3537         else
3538                 errno = cc;
3539
3540         if (errno != 0) {
3541                 /*
3542                  * If the error is EAGAIN, just re-select on this
3543                  * fd and pretend nothing strange happened.
3544                  */
3545                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3546                         sock->connecting = 1;
3547                         select_poke(sock->manager, sock->fd,
3548                                     SELECT_POKE_CONNECT);
3549                         UNLOCK(&sock->lock);
3550
3551                         return;
3552                 }
3553
3554                 /*
3555                  * Translate other errors into ISC_R_* flavors.
3556                  */
3557                 switch (errno) {
3558 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3559                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3560                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3561                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3562                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3563                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3564 #ifdef EHOSTDOWN
3565                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3566 #endif
3567                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3568                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3569                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3570                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3571                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3572                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3573 #undef ERROR_MATCH
3574                 default:
3575                         dev->result = ISC_R_UNEXPECTED;
3576                         isc_sockaddr_format(&sock->address, peerbuf,
3577                                             sizeof(peerbuf));
3578                         isc__strerror(errno, strbuf, sizeof(strbuf));
3579                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3580                                          "internal_connect: connect(%s) %s",
3581                                          peerbuf, strbuf);
3582                 }
3583         } else {
3584                 dev->result = ISC_R_SUCCESS;
3585                 sock->connected = 1;
3586                 sock->bound = 1;
3587         }
3588
3589         sock->connect_ev = NULL;
3590
3591         UNLOCK(&sock->lock);
3592
3593         task = dev->ev_sender;
3594         dev->ev_sender = sock;
3595         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3596 }
3597
3598 isc_result_t
3599 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3600         isc_result_t result;
3601
3602         REQUIRE(VALID_SOCKET(sock));
3603         REQUIRE(addressp != NULL);
3604
3605         LOCK(&sock->lock);
3606
3607         if (sock->connected) {
3608                 *addressp = sock->address;
3609                 result = ISC_R_SUCCESS;
3610         } else {
3611                 result = ISC_R_NOTCONNECTED;
3612         }
3613
3614         UNLOCK(&sock->lock);
3615
3616         return (result);
3617 }
3618
3619 isc_result_t
3620 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3621         ISC_SOCKADDR_LEN_T len;
3622         isc_result_t result;
3623         char strbuf[ISC_STRERRORSIZE];
3624
3625         REQUIRE(VALID_SOCKET(sock));
3626         REQUIRE(addressp != NULL);
3627
3628         LOCK(&sock->lock);
3629
3630         if (!sock->bound) {
3631                 result = ISC_R_NOTBOUND;
3632                 goto out;
3633         }
3634
3635         result = ISC_R_SUCCESS;
3636
3637         len = sizeof(addressp->type);
3638         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3639                 isc__strerror(errno, strbuf, sizeof(strbuf));
3640                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3641                                  strbuf);
3642                 result = ISC_R_UNEXPECTED;
3643                 goto out;
3644         }
3645         addressp->length = (unsigned int)len;
3646
3647  out:
3648         UNLOCK(&sock->lock);
3649
3650         return (result);
3651 }
3652
3653 /*
3654  * Run through the list of events on this socket, and cancel the ones
3655  * queued for task "task" of type "how".  "how" is a bitmask.
3656  */
3657 void
3658 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3659
3660         REQUIRE(VALID_SOCKET(sock));
3661
3662         /*
3663          * Quick exit if there is nothing to do.  Don't even bother locking
3664          * in this case.
3665          */
3666         if (how == 0)
3667                 return;
3668
3669         LOCK(&sock->lock);
3670
3671         /*
3672          * All of these do the same thing, more or less.
3673          * Each will:
3674          *      o If the internal event is marked as "posted" try to
3675          *        remove it from the task's queue.  If this fails, mark it
3676          *        as canceled instead, and let the task clean it up later.
3677          *      o For each I/O request for that task of that type, post
3678          *        its done event with status of "ISC_R_CANCELED".
3679          *      o Reset any state needed.
3680          */
3681         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3682             && !ISC_LIST_EMPTY(sock->recv_list)) {
3683                 isc_socketevent_t      *dev;
3684                 isc_socketevent_t      *next;
3685                 isc_task_t             *current_task;
3686
3687                 dev = ISC_LIST_HEAD(sock->recv_list);
3688
3689                 while (dev != NULL) {
3690                         current_task = dev->ev_sender;
3691                         next = ISC_LIST_NEXT(dev, ev_link);
3692
3693                         if ((task == NULL) || (task == current_task)) {
3694                                 dev->result = ISC_R_CANCELED;
3695                                 send_recvdone_event(sock, &dev);
3696                         }
3697                         dev = next;
3698                 }
3699         }
3700
3701         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3702             && !ISC_LIST_EMPTY(sock->send_list)) {
3703                 isc_socketevent_t      *dev;
3704                 isc_socketevent_t      *next;
3705                 isc_task_t             *current_task;
3706
3707                 dev = ISC_LIST_HEAD(sock->send_list);
3708
3709                 while (dev != NULL) {
3710                         current_task = dev->ev_sender;
3711                         next = ISC_LIST_NEXT(dev, ev_link);
3712
3713                         if ((task == NULL) || (task == current_task)) {
3714                                 dev->result = ISC_R_CANCELED;
3715                                 send_senddone_event(sock, &dev);
3716                         }
3717                         dev = next;
3718                 }
3719         }
3720
3721         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3722             && !ISC_LIST_EMPTY(sock->accept_list)) {
3723                 isc_socket_newconnev_t *dev;
3724                 isc_socket_newconnev_t *next;
3725                 isc_task_t             *current_task;
3726
3727                 dev = ISC_LIST_HEAD(sock->accept_list);
3728                 while (dev != NULL) {
3729                         current_task = dev->ev_sender;
3730                         next = ISC_LIST_NEXT(dev, ev_link);
3731
3732                         if ((task == NULL) || (task == current_task)) {
3733
3734                                 ISC_LIST_UNLINK(sock->accept_list, dev,
3735                                                 ev_link);
3736
3737                                 dev->newsocket->references--;
3738                                 free_socket(&dev->newsocket);
3739
3740                                 dev->result = ISC_R_CANCELED;
3741                                 dev->ev_sender = sock;
3742                                 isc_task_sendanddetach(&current_task,
3743                                                        ISC_EVENT_PTR(&dev));
3744                         }
3745
3746                         dev = next;
3747                 }
3748         }
3749
3750         /*
3751          * Connecting is not a list.
3752          */
3753         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3754             && sock->connect_ev != NULL) {
3755                 isc_socket_connev_t    *dev;
3756                 isc_task_t             *current_task;
3757
3758                 INSIST(sock->connecting);
3759                 sock->connecting = 0;
3760
3761                 dev = sock->connect_ev;
3762                 current_task = dev->ev_sender;
3763
3764                 if ((task == NULL) || (task == current_task)) {
3765                         sock->connect_ev = NULL;
3766
3767                         dev->result = ISC_R_CANCELED;
3768                         dev->ev_sender = sock;
3769                         isc_task_sendanddetach(&current_task,
3770                                                ISC_EVENT_PTR(&dev));
3771                 }
3772         }
3773
3774         UNLOCK(&sock->lock);
3775 }
3776
3777 isc_sockettype_t
3778 isc_socket_gettype(isc_socket_t *sock) {
3779         REQUIRE(VALID_SOCKET(sock));
3780
3781         return (sock->type);
3782 }
3783
3784 isc_boolean_t
3785 isc_socket_isbound(isc_socket_t *sock) {
3786         isc_boolean_t val;
3787
3788         LOCK(&sock->lock);
3789         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3790         UNLOCK(&sock->lock);
3791
3792         return (val);
3793 }
3794
3795 void
3796 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3797 #if defined(IPV6_V6ONLY)
3798         int onoff = yes ? 1 : 0;
3799 #else
3800         UNUSED(yes);
3801         UNUSED(sock);
3802 #endif
3803
3804         REQUIRE(VALID_SOCKET(sock));
3805
3806 #ifdef IPV6_V6ONLY
3807         if (sock->pf == AF_INET6) {
3808                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3809                                  (void *)&onoff, sizeof(onoff));
3810         }
3811 #endif
3812 }
3813
3814 #ifndef ISC_PLATFORM_USETHREADS
3815 void
3816 isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
3817         if (socketmgr == NULL)
3818                 *maxfd = 0;
3819         else {
3820                 *readset = socketmgr->read_fds;
3821                 *writeset = socketmgr->write_fds;
3822                 *maxfd = socketmgr->maxfd + 1;
3823         }
3824 }
3825
3826 isc_result_t
3827 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3828         isc_socketmgr_t *manager = socketmgr;
3829
3830         if (manager == NULL)
3831                 return (ISC_R_NOTFOUND);
3832
3833         process_fds(manager, maxfd, readset, writeset);
3834         return (ISC_R_SUCCESS);
3835 }
3836 #endif /* ISC_PLATFORM_USETHREADS */