]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bind9/lib/isc/unix/socket.c
merge fix for boot-time hang on centos' xen
[FreeBSD/FreeBSD.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2008  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.207.2.19.2.35.4.6 2008/07/29 04:43:57 each Exp $ */
19
20 #include <config.h>
21
22 #include <sys/param.h>
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/uio.h>
27
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <stddef.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34
35 #include <isc/buffer.h>
36 #include <isc/bufferlist.h>
37 #include <isc/condition.h>
38 #include <isc/formatcheck.h>
39 #include <isc/list.h>
40 #include <isc/log.h>
41 #include <isc/mem.h>
42 #include <isc/msgs.h>
43 #include <isc/mutex.h>
44 #include <isc/net.h>
45 #include <isc/once.h>
46 #include <isc/platform.h>
47 #include <isc/print.h>
48 #include <isc/region.h>
49 #include <isc/socket.h>
50 #include <isc/strerror.h>
51 #include <isc/task.h>
52 #include <isc/thread.h>
53 #include <isc/util.h>
54
55 #include "errno2result.h"
56
57 #ifndef ISC_PLATFORM_USETHREADS
58 #include "socket_p.h"
59 #endif /* ISC_PLATFORM_USETHREADS */
60
61 #if defined(SO_BSDCOMPAT) && defined(__linux__)
62 #include <sys/utsname.h>
63 #endif
64
65 /*%
66  * Max number of open sockets.  In the vast majority of cases the default size  
67  * of FD_SETSIZE should be fine, and this constant should be increased only
68  * when absolutely necessary and possible, i.e., the server is exhausting all   
69  * available file descriptors (up to FD_SETSIZE) and the select() function
70  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
72  * portability as possible).  Note also that overall server performance
73  * may be rather worsened with a larger value of this constant due to
74  * inherent scalability problems of select().
75  *
76  * As a special note, this value shouldn't have to be touched if
77  * this is a build for an authoritative only DNS server.
78  */
79
80 #ifndef ISC_SOCKET_FDSETSIZE
81 #define ISC_SOCKET_FDSETSIZE FD_SETSIZE
82 #endif
83
84 /*%
85  * Mac OS X needs a special definition to support larger values in select()
86  */
87 #if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
88 #ifdef __APPLE__
89 #define _DARWIN_UNLIMITED_SELECT
90 #endif  /* __APPLE__ */
91 #endif
92
93 /*%
94  * Some systems define the socket length argument as an int, some as size_t,
95  * some as socklen_t.  This is here so it can be easily changed if needed.
96  */
97 #ifndef ISC_SOCKADDR_LEN_T
98 #define ISC_SOCKADDR_LEN_T unsigned int
99 #endif
100
101 /*
102  * Define what the possible "soft" errors can be.  These are non-fatal returns
103  * of various network related functions, like recv() and so on.
104  *
105  * For some reason, BSDI (and perhaps others) will sometimes return <0
106  * from recv() but will have errno==0.  This is broken, but we have to
107  * work around it here.
108  */
109 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
110                          (e) == EWOULDBLOCK || \
111                          (e) == EINTR || \
112                          (e) == 0)
113
114 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
115
116 /*
117  * DLVL(90)  --  Function entry/exit and other tracing.
118  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
119  * DLVL(60)  --  Socket data send/receive
120  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
121  * DLVL(20)  --  Socket creation/destruction.
122  */
123 #define TRACE_LEVEL             90
124 #define CORRECTNESS_LEVEL       70
125 #define IOEVENT_LEVEL           60
126 #define EVENT_LEVEL             50
127 #define CREATION_LEVEL          20
128
129 #define TRACE           DLVL(TRACE_LEVEL)
130 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
131 #define IOEVENT         DLVL(IOEVENT_LEVEL)
132 #define EVENT           DLVL(EVENT_LEVEL)
133 #define CREATION        DLVL(CREATION_LEVEL)
134
135 typedef isc_event_t intev_t;
136
137 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
138 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
139
140 /*
141  * IPv6 control information.  If the socket is an IPv6 socket we want
142  * to collect the destination address and interface so the client can
143  * set them on outgoing packets.
144  */
145 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
146 #ifndef USE_CMSG
147 #define USE_CMSG        1
148 #endif
149 #endif
150
151 /*
152  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
153  * a setsockopt() like interface to request timestamps, and if the OS
154  * doesn't do it for us, call gettimeofday() on every UDP receive?
155  */
156 #ifdef SO_TIMESTAMP
157 #ifndef USE_CMSG
158 #define USE_CMSG        1
159 #endif
160 #endif
161
162 /*
163  * The number of times a send operation is repeated if the result is EINTR.
164  */
165 #define NRETRIES 10
166
167 struct isc_socket {
168         /* Not locked. */
169         unsigned int            magic;
170         isc_socketmgr_t        *manager;
171         isc_mutex_t             lock;
172         isc_sockettype_t        type;
173
174         /* Locked by socket lock. */
175         ISC_LINK(isc_socket_t)  link;
176         unsigned int            references;
177         int                     fd;
178         int                     pf;
179
180         ISC_LIST(isc_socketevent_t)             send_list;
181         ISC_LIST(isc_socketevent_t)             recv_list;
182         ISC_LIST(isc_socket_newconnev_t)        accept_list;
183         isc_socket_connev_t                    *connect_ev;
184
185         /*
186          * Internal events.  Posted when a descriptor is readable or
187          * writable.  These are statically allocated and never freed.
188          * They will be set to non-purgable before use.
189          */
190         intev_t                 readable_ev;
191         intev_t                 writable_ev;
192
193         isc_sockaddr_t          address;  /* remote address */
194
195         unsigned int            pending_recv : 1,
196                                 pending_send : 1,
197                                 pending_accept : 1,
198                                 listener : 1, /* listener socket */
199                                 connected : 1,
200                                 connecting : 1, /* connect pending */
201                                 bound : 1; /* bound to local addr */
202
203 #ifdef ISC_NET_RECVOVERFLOW
204         unsigned char           overflow; /* used for MSG_TRUNC fake */
205 #endif
206
207         char                    *recvcmsgbuf;
208         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
209         char                    *sendcmsgbuf;
210         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
211 };
212
213 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
214 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
215
/*
 * State kept for a socket manager: the set of sockets it owns and the
 * select() descriptor sets used to watch them.
 */
struct isc_socketmgr {
        /* Not locked. */
        unsigned int            magic;
        isc_mem_t              *mctx;
        isc_mutex_t             lock;
        int                     fd_bufsize;
        /* Upper bound (exclusive) on descriptors usable with 'fds'/'fdstate'. */
        int                     fdsize;
        /* Locked by manager lock. */
        ISC_LIST(isc_socket_t)  socklist;
        /* Descriptors currently being watched for reading / writing. */
        fd_set                  *read_fds;
        fd_set                  *read_fds_copy;
        fd_set                  *write_fds;
        fd_set                  *write_fds_copy;
        /* Indexed by descriptor: the owning socket. */
        isc_socket_t           **fds;
        /* Indexed by descriptor: CLOSED, MANAGED or CLOSE_PENDING. */
        int                     *fdstate;
        int                     maxfd;
        int                     reserved;       /* unlocked */
#ifdef ISC_PLATFORM_USETHREADS
        isc_thread_t            watcher;
        isc_condition_t         shutdown_ok;
        /*
         * Internal pipe: select_poke() writes (fd, msg) pairs to
         * pipe_fds[1], select_readmsg() reads them from pipe_fds[0].
         */
        int                     pipe_fds[2];
#else /* ISC_PLATFORM_USETHREADS */
        unsigned int            refs;
#endif /* ISC_PLATFORM_USETHREADS */
};
241
242 #ifndef ISC_PLATFORM_USETHREADS
243 static isc_socketmgr_t *socketmgr = NULL;
244 #endif /* ISC_PLATFORM_USETHREADS */
245
/*
 * Per-descriptor states stored in the manager's 'fdstate' array.
 * wakeup_socket() only starts watching descriptors that are MANAGED, and
 * closes (and marks CLOSED) descriptors that are CLOSE_PENDING.
 */
#define CLOSED          0       /* this one must be zero */
#define MANAGED         1
#define CLOSE_PENDING   2
249
250 /*
251  * send() and recv() iovec counts
252  */
253 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
254 #ifdef ISC_NET_RECVOVERFLOW
255 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
256 #else
257 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
258 #endif
259
260 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
261 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
262 static void free_socket(isc_socket_t **);
263 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
264                                     isc_socket_t **);
265 static void destroy(isc_socket_t **);
266 static void internal_accept(isc_task_t *, isc_event_t *);
267 static void internal_connect(isc_task_t *, isc_event_t *);
268 static void internal_recv(isc_task_t *, isc_event_t *);
269 static void internal_send(isc_task_t *, isc_event_t *);
270 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
271 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
272                               struct msghdr *, struct iovec *, size_t *);
273 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
274                               struct msghdr *, struct iovec *, size_t *);
275 static void cleanup_fdsets(isc_socketmgr_t *, isc_mem_t *);
276 static isc_result_t create_fdsets(isc_socketmgr_t *, isc_mem_t *);
277
/*
 * Message codes passed as the 'msg' argument of select_poke().  All of
 * them are negative.  Accept shares its code with read, and connect shares
 * its code with write, so wakeup_socket() only needs to distinguish
 * _READ (watch for readability) from _WRITE (watch for writability).
 */
#define SELECT_POKE_SHUTDOWN            (-1)
#define SELECT_POKE_NOTHING             (-2)
#define SELECT_POKE_READ                (-3)
#define SELECT_POKE_ACCEPT              (-3) /* Same as _READ */
#define SELECT_POKE_WRITE               (-4)
#define SELECT_POKE_CONNECT             (-4) /* Same as _WRITE */
#define SELECT_POKE_CLOSE               (-5)

/* True when the socket has no remaining references. */
#define SOCK_DEAD(s)                    ((s)->references == 0)
287
288 static void
289 manager_log(isc_socketmgr_t *sockmgr,
290             isc_logcategory_t *category, isc_logmodule_t *module, int level,
291             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
292 static void
293 manager_log(isc_socketmgr_t *sockmgr,
294             isc_logcategory_t *category, isc_logmodule_t *module, int level,
295             const char *fmt, ...)
296 {
297         char msgbuf[2048];
298         va_list ap;
299
300         if (! isc_log_wouldlog(isc_lctx, level))
301                 return;
302
303         va_start(ap, fmt);
304         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
305         va_end(ap);
306
307         isc_log_write(isc_lctx, category, module, level,
308                       "sockmgr %p: %s", sockmgr, msgbuf);
309 }
310
311 static void
312 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
313            isc_logcategory_t *category, isc_logmodule_t *module, int level,
314            isc_msgcat_t *msgcat, int msgset, int message,
315            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
316 static void
317 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
318            isc_logcategory_t *category, isc_logmodule_t *module, int level,
319            isc_msgcat_t *msgcat, int msgset, int message,
320            const char *fmt, ...)
321 {
322         char msgbuf[2048];
323         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
324         va_list ap;
325
326         if (! isc_log_wouldlog(isc_lctx, level))
327                 return;
328
329         va_start(ap, fmt);
330         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
331         va_end(ap);
332
333         if (address == NULL) {
334                 isc_log_iwrite(isc_lctx, category, module, level,
335                                msgcat, msgset, message,
336                                "socket %p: %s", sock, msgbuf);
337         } else {
338                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
339                 isc_log_iwrite(isc_lctx, category, module, level,
340                                msgcat, msgset, message,
341                                "socket %p %s: %s", sock, peerbuf, msgbuf);
342         }
343 }
344
345 static void
346 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
347         isc_socket_t *sock;
348
349         /*
350          * This is a wakeup on a socket.  If the socket is not in the
351          * process of being closed, start watching it for either reads
352          * or writes.
353          */
354
355         INSIST(fd >= 0 && fd < manager->fdsize);
356
357         if (manager->fdstate[fd] == CLOSE_PENDING) {
358                 manager->fdstate[fd] = CLOSED;
359                 FD_CLR(fd, manager->read_fds);
360                 FD_CLR(fd, manager->write_fds);
361                 (void)close(fd);
362                 return;
363         }
364         if (manager->fdstate[fd] != MANAGED)
365                 return;
366
367         sock = manager->fds[fd];
368
369         /*
370          * Set requested bit.
371          */
372         if (msg == SELECT_POKE_READ)
373                 FD_SET(sock->fd, manager->read_fds);
374         if (msg == SELECT_POKE_WRITE)
375                 FD_SET(sock->fd, manager->write_fds);
376 }
377
378 #ifdef ISC_PLATFORM_USETHREADS
379 /*
380  * Poke the select loop when there is something for us to do.
381  * The write is required (by POSIX) to complete.  That is, we
382  * will not get partial writes.
383  */
384 static void
385 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
386         int cc;
387         int buf[2];
388         char strbuf[ISC_STRERRORSIZE];
389
390         buf[0] = fd;
391         buf[1] = msg;
392
393         do {
394                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
395 #ifdef ENOSR
396                 /*
397                  * Treat ENOSR as EAGAIN but loop slowly as it is
398                  * unlikely to clear fast.
399                  */
400                 if (cc < 0 && errno == ENOSR) {
401                         sleep(1);
402                         errno = EAGAIN;
403                 }
404 #endif
405         } while (cc < 0 && SOFT_ERROR(errno));
406
407         if (cc < 0) {
408                 isc__strerror(errno, strbuf, sizeof(strbuf));
409                 FATAL_ERROR(__FILE__, __LINE__,
410                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
411                                            ISC_MSG_WRITEFAILED,
412                                            "write() failed "
413                                            "during watcher poke: %s"),
414                             strbuf);
415         }
416
417         INSIST(cc == sizeof(buf));
418 }
419
420 /*
421  * Read a message on the internal fd.
422  */
423 static void
424 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
425         int buf[2];
426         int cc;
427         char strbuf[ISC_STRERRORSIZE];
428
429         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
430         if (cc < 0) {
431                 *msg = SELECT_POKE_NOTHING;
432                 *fd = -1;       /* Silence compiler. */
433                 if (SOFT_ERROR(errno))
434                         return;
435
436                 isc__strerror(errno, strbuf, sizeof(strbuf));
437                 FATAL_ERROR(__FILE__, __LINE__,
438                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
439                                            ISC_MSG_READFAILED,
440                                            "read() failed "
441                                            "during watcher poke: %s"),
442                             strbuf);
443
444                 return;
445         }
446         INSIST(cc == sizeof(buf));
447
448         *fd = buf[0];
449         *msg = buf[1];
450 }
451 #else /* ISC_PLATFORM_USETHREADS */
452 /*
453  * Update the state of the socketmgr when something changes.
454  */
455 static void
456 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
457         if (msg == SELECT_POKE_SHUTDOWN)
458                 return;
459         else if (fd >= 0)
460                 wakeup_socket(manager, fd, msg);
461         return;
462 }
463 #endif /* ISC_PLATFORM_USETHREADS */
464
465 /*
466  * Make a fd non-blocking.
467  */
468 static isc_result_t
469 make_nonblock(int fd) {
470         int ret;
471         int flags;
472         char strbuf[ISC_STRERRORSIZE];
473 #ifdef USE_FIONBIO_IOCTL
474         int on = 1;
475
476         ret = ioctl(fd, FIONBIO, (char *)&on);
477 #else
478         flags = fcntl(fd, F_GETFL, 0);
479         flags |= PORT_NONBLOCK;
480         ret = fcntl(fd, F_SETFL, flags);
481 #endif
482
483         if (ret == -1) {
484                 isc__strerror(errno, strbuf, sizeof(strbuf));
485                 UNEXPECTED_ERROR(__FILE__, __LINE__,
486 #ifdef USE_FIONBIO_IOCTL
487                                  "ioctl(%d, FIONBIO, &on): %s", fd,
488 #else
489                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
490 #endif
491                                  strbuf);
492
493                 return (ISC_R_UNEXPECTED);
494         }
495
496         return (ISC_R_SUCCESS);
497 }
498
499 #ifdef USE_CMSG
500 /*
501  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
502  * In order to ensure as much portability as possible, we provide wrapper
503  * functions of these macros.
504  * Note that cmsg_space() could run slow on OSes that do not have
505  * CMSG_SPACE.
506  */
507 static inline ISC_SOCKADDR_LEN_T
508 cmsg_len(ISC_SOCKADDR_LEN_T len) {
509 #ifdef CMSG_LEN
510         return (CMSG_LEN(len));
511 #else
512         ISC_SOCKADDR_LEN_T hdrlen;
513
514         /*
515          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
516          * is correct.
517          */
518         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
519         return (hdrlen + len);
520 #endif
521 }
522
523 static inline ISC_SOCKADDR_LEN_T
524 cmsg_space(ISC_SOCKADDR_LEN_T len) {
525 #ifdef CMSG_SPACE
526         return (CMSG_SPACE(len));
527 #else
528         struct msghdr msg;
529         struct cmsghdr *cmsgp;
530         /*
531          * XXX: The buffer length is an ad-hoc value, but should be enough
532          * in a practical sense.
533          */
534         char dummybuf[sizeof(struct cmsghdr) + 1024];
535
536         memset(&msg, 0, sizeof(msg));
537         msg.msg_control = dummybuf;
538         msg.msg_controllen = sizeof(dummybuf);
539
540         cmsgp = (struct cmsghdr *)dummybuf;
541         cmsgp->cmsg_len = cmsg_len(len);
542
543         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
544         if (cmsgp != NULL)
545                 return ((char *)cmsgp - (char *)msg.msg_control);
546         else
547                 return (0);
548 #endif
549 }
550 #endif /* USE_CMSG */
551
552 /*
553  * Process control messages received on a socket.
554  */
555 static void
556 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
557 #ifdef USE_CMSG
558         struct cmsghdr *cmsgp;
559 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
560         struct in6_pktinfo *pktinfop;
561 #endif
562 #ifdef SO_TIMESTAMP
563         struct timeval *timevalp;
564 #endif
565 #endif
566
567         /*
568          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
569          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
570          * They are all here, outside of the CPP tests, because it is
571          * more consistent with the usual ISC coding style.
572          */
573         UNUSED(sock);
574         UNUSED(msg);
575         UNUSED(dev);
576
577 #ifdef ISC_NET_BSD44MSGHDR
578
579 #ifdef MSG_TRUNC
580         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
581                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
582 #endif
583
584 #ifdef MSG_CTRUNC
585         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
586                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
587 #endif
588
589 #ifndef USE_CMSG
590         return;
591 #else
592         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
593                 return;
594
595 #ifdef SO_TIMESTAMP
596         timevalp = NULL;
597 #endif
598 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
599         pktinfop = NULL;
600 #endif
601
602         cmsgp = CMSG_FIRSTHDR(msg);
603         while (cmsgp != NULL) {
604                 socket_log(sock, NULL, TRACE,
605                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
606                            "processing cmsg %p", cmsgp);
607
608 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
609                 if (cmsgp->cmsg_level == IPPROTO_IPV6
610                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
611
612                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
613                         memcpy(&dev->pktinfo, pktinfop,
614                                sizeof(struct in6_pktinfo));
615                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
616                         socket_log(sock, NULL, TRACE,
617                                    isc_msgcat, ISC_MSGSET_SOCKET,
618                                    ISC_MSG_IFRECEIVED,
619                                    "interface received on ifindex %u",
620                                    dev->pktinfo.ipi6_ifindex);
621                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
622                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
623                         goto next;
624                 }
625 #endif
626
627 #ifdef SO_TIMESTAMP
628                 if (cmsgp->cmsg_level == SOL_SOCKET
629                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
630                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
631                         dev->timestamp.seconds = timevalp->tv_sec;
632                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
633                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
634                         goto next;
635                 }
636 #endif
637
638         next:
639                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
640         }
641 #endif /* USE_CMSG */
642
643 #endif /* ISC_NET_BSD44MSGHDR */
644 }
645
646 /*
647  * Construct an iov array and attach it to the msghdr passed in.  This is
648  * the SEND constructor, which will use the used region of the buffer
649  * (if using a buffer list) or will use the internal region (if a single
650  * buffer I/O is requested).
651  *
652  * Nothing can be NULL, and the done event must list at least one buffer
653  * on the buffer linked list for this function to be meaningful.
654  *
655  * If write_countp != NULL, *write_countp will hold the number of bytes
656  * this transaction can send.
657  */
658 static void
659 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
660                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
661 {
662         unsigned int iovcount;
663         isc_buffer_t *buffer;
664         isc_region_t used;
665         size_t write_count;
666         size_t skip_count;
667
668         memset(msg, 0, sizeof(*msg));
669
670         if (sock->type == isc_sockettype_udp) {
671                 msg->msg_name = (void *)&dev->address.type.sa;
672                 msg->msg_namelen = dev->address.length;
673         } else {
674                 msg->msg_name = NULL;
675                 msg->msg_namelen = 0;
676         }
677
678         buffer = ISC_LIST_HEAD(dev->bufferlist);
679         write_count = 0;
680         iovcount = 0;
681
682         /*
683          * Single buffer I/O?  Skip what we've done so far in this region.
684          */
685         if (buffer == NULL) {
686                 write_count = dev->region.length - dev->n;
687                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
688                 iov[0].iov_len = write_count;
689                 iovcount = 1;
690
691                 goto config;
692         }
693
694         /*
695          * Multibuffer I/O.
696          * Skip the data in the buffer list that we have already written.
697          */
698         skip_count = dev->n;
699         while (buffer != NULL) {
700                 REQUIRE(ISC_BUFFER_VALID(buffer));
701                 if (skip_count < isc_buffer_usedlength(buffer))
702                         break;
703                 skip_count -= isc_buffer_usedlength(buffer);
704                 buffer = ISC_LIST_NEXT(buffer, link);
705         }
706
707         while (buffer != NULL) {
708                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
709
710                 isc_buffer_usedregion(buffer, &used);
711
712                 if (used.length > 0) {
713                         iov[iovcount].iov_base = (void *)(used.base
714                                                           + skip_count);
715                         iov[iovcount].iov_len = used.length - skip_count;
716                         write_count += (used.length - skip_count);
717                         skip_count = 0;
718                         iovcount++;
719                 }
720                 buffer = ISC_LIST_NEXT(buffer, link);
721         }
722
723         INSIST(skip_count == 0U);
724
725  config:
726         msg->msg_iov = iov;
727         msg->msg_iovlen = iovcount;
728
729 #ifdef ISC_NET_BSD44MSGHDR
730         msg->msg_control = NULL;
731         msg->msg_controllen = 0;
732         msg->msg_flags = 0;
733 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
734         if ((sock->type == isc_sockettype_udp)
735             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
736                 struct cmsghdr *cmsgp;
737                 struct in6_pktinfo *pktinfop;
738
739                 socket_log(sock, NULL, TRACE,
740                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
741                            "sendto pktinfo data, ifindex %u",
742                            dev->pktinfo.ipi6_ifindex);
743
744                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
745                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
746                 msg->msg_control = (void *)sock->sendcmsgbuf;
747
748                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
749                 cmsgp->cmsg_level = IPPROTO_IPV6;
750                 cmsgp->cmsg_type = IPV6_PKTINFO;
751                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
752                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
753                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
754         }
755 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
756 #else /* ISC_NET_BSD44MSGHDR */
757         msg->msg_accrights = NULL;
758         msg->msg_accrightslen = 0;
759 #endif /* ISC_NET_BSD44MSGHDR */
760
761         if (write_countp != NULL)
762                 *write_countp = write_count;
763 }
764
765 /*
766  * Construct an iov array and attach it to the msghdr passed in.  This is
767  * the RECV constructor, which will use the avialable region of the buffer
768  * (if using a buffer list) or will use the internal region (if a single
769  * buffer I/O is requested).
770  *
771  * Nothing can be NULL, and the done event must list at least one buffer
772  * on the buffer linked list for this function to be meaningful.
773  *
774  * If read_countp != NULL, *read_countp will hold the number of bytes
775  * this transaction can receive.
776  */
777 static void
778 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
779                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
780 {
781         unsigned int iovcount;
782         isc_buffer_t *buffer;
783         isc_region_t available;
784         size_t read_count;
785
786         memset(msg, 0, sizeof(struct msghdr));
787
788         if (sock->type == isc_sockettype_udp) {
789                 memset(&dev->address, 0, sizeof(dev->address));
790 #ifdef BROKEN_RECVMSG
791                 if (sock->pf == AF_INET) {
792                         msg->msg_name = (void *)&dev->address.type.sin;
793                         msg->msg_namelen = sizeof(dev->address.type.sin6);
794                 } else if (sock->pf == AF_INET6) {
795                         msg->msg_name = (void *)&dev->address.type.sin6;
796                         msg->msg_namelen = sizeof(dev->address.type.sin6);
797 #ifdef ISC_PLATFORM_HAVESYSUNH
798                 } else if (sock->pf == AF_UNIX) {
799                         msg->msg_name = (void *)&dev->address.type.sunix;
800                         msg->msg_namelen = sizeof(dev->address.type.sunix);
801 #endif
802                 } else {
803                         msg->msg_name = (void *)&dev->address.type.sa;
804                         msg->msg_namelen = sizeof(dev->address.type);
805                 }
806 #else
807                 msg->msg_name = (void *)&dev->address.type.sa;
808                 msg->msg_namelen = sizeof(dev->address.type);
809 #endif
810 #ifdef ISC_NET_RECVOVERFLOW
811                 /* If needed, steal one iovec for overflow detection. */
812                 maxiov--;
813 #endif
814         } else { /* TCP */
815                 msg->msg_name = NULL;
816                 msg->msg_namelen = 0;
817                 dev->address = sock->address;
818         }
819
820         buffer = ISC_LIST_HEAD(dev->bufferlist);
821         read_count = 0;
822
823         /*
824          * Single buffer I/O?  Skip what we've done so far in this region.
825          */
826         if (buffer == NULL) {
827                 read_count = dev->region.length - dev->n;
828                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
829                 iov[0].iov_len = read_count;
830                 iovcount = 1;
831
832                 goto config;
833         }
834
835         /*
836          * Multibuffer I/O.
837          * Skip empty buffers.
838          */
839         while (buffer != NULL) {
840                 REQUIRE(ISC_BUFFER_VALID(buffer));
841                 if (isc_buffer_availablelength(buffer) != 0)
842                         break;
843                 buffer = ISC_LIST_NEXT(buffer, link);
844         }
845
846         iovcount = 0;
847         while (buffer != NULL) {
848                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
849
850                 isc_buffer_availableregion(buffer, &available);
851
852                 if (available.length > 0) {
853                         iov[iovcount].iov_base = (void *)(available.base);
854                         iov[iovcount].iov_len = available.length;
855                         read_count += available.length;
856                         iovcount++;
857                 }
858                 buffer = ISC_LIST_NEXT(buffer, link);
859         }
860
861  config:
862
863         /*
864          * If needed, set up to receive that one extra byte.  Note that
865          * we know there is at least one iov left, since we stole it
866          * at the top of this function.
867          */
868 #ifdef ISC_NET_RECVOVERFLOW
869         if (sock->type == isc_sockettype_udp) {
870                 iov[iovcount].iov_base = (void *)(&sock->overflow);
871                 iov[iovcount].iov_len = 1;
872                 iovcount++;
873         }
874 #endif
875
876         msg->msg_iov = iov;
877         msg->msg_iovlen = iovcount;
878
879 #ifdef ISC_NET_BSD44MSGHDR
880         msg->msg_control = NULL;
881         msg->msg_controllen = 0;
882         msg->msg_flags = 0;
883 #if defined(USE_CMSG)
884         if (sock->type == isc_sockettype_udp) {
885                 msg->msg_control = sock->recvcmsgbuf;
886                 msg->msg_controllen = sock->recvcmsgbuflen;
887         }
888 #endif /* USE_CMSG */
889 #else /* ISC_NET_BSD44MSGHDR */
890         msg->msg_accrights = NULL;
891         msg->msg_accrightslen = 0;
892 #endif /* ISC_NET_BSD44MSGHDR */
893
894         if (read_countp != NULL)
895                 *read_countp = read_count;
896 }
897
898 static void
899 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
900                 isc_socketevent_t *dev)
901 {
902         if (sock->type == isc_sockettype_udp) {
903                 if (address != NULL)
904                         dev->address = *address;
905                 else
906                         dev->address = sock->address;
907         } else if (sock->type == isc_sockettype_tcp) {
908                 INSIST(address == NULL);
909                 dev->address = sock->address;
910         }
911 }
912
913 static isc_socketevent_t *
914 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
915                      isc_taskaction_t action, const void *arg)
916 {
917         isc_socketevent_t *ev;
918
919         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
920                                                      sock, eventtype,
921                                                      action, arg,
922                                                      sizeof(*ev));
923
924         if (ev == NULL)
925                 return (NULL);
926
927         ev->result = ISC_R_UNEXPECTED;
928         ISC_LINK_INIT(ev, ev_link);
929         ISC_LIST_INIT(ev->bufferlist);
930         ev->region.base = NULL;
931         ev->n = 0;
932         ev->offset = 0;
933         ev->attributes = 0;
934
935         return (ev);
936 }
937
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the fields of 'msg' (name, iovec array and, when
 * the platform has BSD 4.4 style msghdrs, the control buffer) to stdout.
 *
 * The widths and signedness of msg_namelen, msg_iovlen, iov_len and
 * msg_controllen differ between platforms, so every value is cast to a
 * type that matches its printf conversion; pointers are cast to void *
 * as "%p" requires.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long)msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long)msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %lu\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
               (unsigned long)msg->msg_controllen);
#endif
}
#endif
956
/*
 * Status codes returned by doio_recv() and doio_send().
 */
#define DOIO_SUCCESS    0       /* I/O completed; dev->result is ISC_R_SUCCESS */
#define DOIO_SOFT       1       /* soft error or partial I/O; retry later */
#define DOIO_HARD       2       /* hard I/O error; dev->result holds the error */
#define DOIO_EOF        3       /* end of file seen on a TCP read */
961
962 static int
963 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
964         int cc;
965         struct iovec iov[MAXSCATTERGATHER_RECV];
966         size_t read_count;
967         size_t actual_count;
968         struct msghdr msghdr;
969         isc_buffer_t *buffer;
970         int recv_errno;
971         char strbuf[ISC_STRERRORSIZE];
972
973         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
974
975 #if defined(ISC_SOCKET_DEBUG)
976         dump_msg(&msghdr);
977 #endif
978
979         cc = recvmsg(sock->fd, &msghdr, 0);
980         recv_errno = errno;
981
982 #if defined(ISC_SOCKET_DEBUG)
983         dump_msg(&msghdr);
984 #endif
985
986         if (cc < 0) {
987                 if (SOFT_ERROR(recv_errno))
988                         return (DOIO_SOFT);
989
990                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
991                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
992                         socket_log(sock, NULL, IOEVENT,
993                                    isc_msgcat, ISC_MSGSET_SOCKET,
994                                    ISC_MSG_DOIORECV,
995                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
996                                    sock->fd, cc, recv_errno, strbuf);
997                 }
998
999 #define SOFT_OR_HARD(_system, _isc) \
1000         if (recv_errno == _system) { \
1001                 if (sock->connected) { \
1002                         dev->result = _isc; \
1003                         return (DOIO_HARD); \
1004                 } \
1005                 return (DOIO_SOFT); \
1006         }
1007 #define ALWAYS_HARD(_system, _isc) \
1008         if (recv_errno == _system) { \
1009                 dev->result = _isc; \
1010                 return (DOIO_HARD); \
1011         }
1012
1013                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1014                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1015                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1016                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1017                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1018                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1019                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1020
1021 #undef SOFT_OR_HARD
1022 #undef ALWAYS_HARD
1023
1024                 dev->result = isc__errno2result(recv_errno);
1025                 return (DOIO_HARD);
1026         }
1027
1028         /*
1029          * On TCP, zero length reads indicate EOF, while on
1030          * UDP, zero length reads are perfectly valid, although
1031          * strange.
1032          */
1033         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1034                 return (DOIO_EOF);
1035
1036         if (sock->type == isc_sockettype_udp) {
1037                 dev->address.length = msghdr.msg_namelen;
1038                 if (isc_sockaddr_getport(&dev->address) == 0) {
1039                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1040                                 socket_log(sock, &dev->address, IOEVENT,
1041                                            isc_msgcat, ISC_MSGSET_SOCKET,
1042                                            ISC_MSG_ZEROPORT,
1043                                            "dropping source port zero packet");
1044                         }
1045                         return (DOIO_SOFT);
1046                 }
1047         }
1048
1049         socket_log(sock, &dev->address, IOEVENT,
1050                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1051                    "packet received correctly");
1052
1053         /*
1054          * Overflow bit detection.  If we received MORE bytes than we should,
1055          * this indicates an overflow situation.  Set the flag in the
1056          * dev entry and adjust how much we read by one.
1057          */
1058 #ifdef ISC_NET_RECVOVERFLOW
1059         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1060                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1061                 cc--;
1062         }
1063 #endif
1064
1065         /*
1066          * If there are control messages attached, run through them and pull
1067          * out the interesting bits.
1068          */
1069         if (sock->type == isc_sockettype_udp)
1070                 process_cmsg(sock, &msghdr, dev);
1071
1072         /*
1073          * update the buffers (if any) and the i/o count
1074          */
1075         dev->n += cc;
1076         actual_count = cc;
1077         buffer = ISC_LIST_HEAD(dev->bufferlist);
1078         while (buffer != NULL && actual_count > 0U) {
1079                 REQUIRE(ISC_BUFFER_VALID(buffer));
1080                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1081                         actual_count -= isc_buffer_availablelength(buffer);
1082                         isc_buffer_add(buffer,
1083                                        isc_buffer_availablelength(buffer));
1084                 } else {
1085                         isc_buffer_add(buffer, actual_count);
1086                         actual_count = 0;
1087                         break;
1088                 }
1089                 buffer = ISC_LIST_NEXT(buffer, link);
1090                 if (buffer == NULL) {
1091                         INSIST(actual_count == 0U);
1092                 }
1093         }
1094
1095         /*
1096          * If we read less than we expected, update counters,
1097          * and let the upper layer poke the descriptor.
1098          */
1099         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1100                 return (DOIO_SOFT);
1101
1102         /*
1103          * Full reads are posted, or partials if partials are ok.
1104          */
1105         dev->result = ISC_R_SUCCESS;
1106         return (DOIO_SUCCESS);
1107 }
1108
1109 /*
1110  * Returns:
1111  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1112  *                      ISC_R_SUCCESS.
1113  *
1114  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1115  *                      dev->result contains the appropriate error.
1116  *
1117  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1118  *                      event was sent.  The operation should be retried.
1119  *
1120  *      No other return values are possible.
1121  */
1122 static int
1123 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1124         int cc;
1125         struct iovec iov[MAXSCATTERGATHER_SEND];
1126         size_t write_count;
1127         struct msghdr msghdr;
1128         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1129         int attempts = 0;
1130         int send_errno;
1131         char strbuf[ISC_STRERRORSIZE];
1132
1133         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1134
1135  resend:
1136         cc = sendmsg(sock->fd, &msghdr, 0);
1137         send_errno = errno;
1138
1139         /*
1140          * Check for error or block condition.
1141          */
1142         if (cc < 0) {
1143                 if (send_errno == EINTR && ++attempts < NRETRIES)
1144                         goto resend;
1145
1146                 if (SOFT_ERROR(send_errno))
1147                         return (DOIO_SOFT);
1148
1149 #define SOFT_OR_HARD(_system, _isc) \
1150         if (send_errno == _system) { \
1151                 if (sock->connected) { \
1152                         dev->result = _isc; \
1153                         return (DOIO_HARD); \
1154                 } \
1155                 return (DOIO_SOFT); \
1156         }
1157 #define ALWAYS_HARD(_system, _isc) \
1158         if (send_errno == _system) { \
1159                 dev->result = _isc; \
1160                 return (DOIO_HARD); \
1161         }
1162
1163                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1164                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1165                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1166                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1167                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1168 #ifdef EHOSTDOWN
1169                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1170 #endif
1171                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1172                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1173                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1174                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1175                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1176
1177 #undef SOFT_OR_HARD
1178 #undef ALWAYS_HARD
1179
1180                 /*
1181                  * The other error types depend on whether or not the
1182                  * socket is UDP or TCP.  If it is UDP, some errors
1183                  * that we expect to be fatal under TCP are merely
1184                  * annoying, and are really soft errors.
1185                  *
1186                  * However, these soft errors are still returned as
1187                  * a status.
1188                  */
1189                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1190                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1191                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1192                                  addrbuf, strbuf);
1193                 dev->result = isc__errno2result(send_errno);
1194                 return (DOIO_HARD);
1195         }
1196
1197         if (cc == 0)
1198                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1199                                  "internal_send: send() %s 0",
1200                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1201                                                 ISC_MSG_RETURNED, "returned"));
1202
1203         /*
1204          * If we write less than we expected, update counters, poke.
1205          */
1206         dev->n += cc;
1207         if ((size_t)cc != write_count)
1208                 return (DOIO_SOFT);
1209
1210         /*
1211          * Exactly what we wanted to write.  We're done with this
1212          * entry.  Post its completion event.
1213          */
1214         dev->result = ISC_R_SUCCESS;
1215         return (DOIO_SUCCESS);
1216 }
1217
1218 /*
1219  * Kill.
1220  *
1221  * Caller must ensure that the socket is not locked and no external
1222  * references exist.
1223  */
1224 static void
1225 destroy(isc_socket_t **sockp) {
1226         isc_socket_t *sock = *sockp;
1227         isc_socketmgr_t *manager = sock->manager;
1228
1229         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1230                    ISC_MSG_DESTROYING, "destroying");
1231
1232         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1233         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1234         INSIST(ISC_LIST_EMPTY(sock->send_list));
1235         INSIST(sock->connect_ev == NULL);
1236         REQUIRE(sock->fd >= 0 && sock->fd < (int)manager->fdsize);
1237
1238         LOCK(&manager->lock);
1239
1240         /*
1241          * No one has this socket open, so the watcher doesn't have to be
1242          * poked, and the socket doesn't have to be locked.
1243          */
1244         manager->fds[sock->fd] = NULL;
1245         manager->fdstate[sock->fd] = CLOSE_PENDING;
1246         select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1247         ISC_LIST_UNLINK(manager->socklist, sock, link);
1248
1249 #ifdef ISC_PLATFORM_USETHREADS
1250         if (ISC_LIST_EMPTY(manager->socklist))
1251                 SIGNAL(&manager->shutdown_ok);
1252 #endif /* ISC_PLATFORM_USETHREADS */
1253
1254         /*
1255          * XXX should reset manager->maxfd here
1256          */
1257
1258         UNLOCK(&manager->lock);
1259
1260         free_socket(sockp);
1261 }
1262
1263 static isc_result_t
1264 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1265                 isc_socket_t **socketp)
1266 {
1267         isc_socket_t *sock;
1268         isc_result_t ret;
1269         ISC_SOCKADDR_LEN_T cmsgbuflen;
1270
1271         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1272
1273         if (sock == NULL)
1274                 return (ISC_R_NOMEMORY);
1275
1276         ret = ISC_R_UNEXPECTED;
1277
1278         sock->magic = 0;
1279         sock->references = 0;
1280
1281         sock->manager = manager;
1282         sock->type = type;
1283         sock->fd = -1;
1284
1285         ISC_LINK_INIT(sock, link);
1286
1287         sock->recvcmsgbuf = NULL;
1288         sock->sendcmsgbuf = NULL;
1289
1290         /*
1291          * set up cmsg buffers
1292          */
1293         cmsgbuflen = 0;
1294 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1295         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1296 #endif
1297 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1298         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1299 #endif
1300         sock->recvcmsgbuflen = cmsgbuflen;
1301         if (sock->recvcmsgbuflen != 0U) {
1302                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1303                 if (sock->recvcmsgbuf == NULL)
1304                         goto error;
1305         }
1306
1307         cmsgbuflen = 0;
1308 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1309         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1310 #endif
1311         sock->sendcmsgbuflen = cmsgbuflen;
1312         if (sock->sendcmsgbuflen != 0U) {
1313                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1314                 if (sock->sendcmsgbuf == NULL)
1315                         goto error;
1316         }
1317
1318         /*
1319          * set up list of readers and writers to be initially empty
1320          */
1321         ISC_LIST_INIT(sock->recv_list);
1322         ISC_LIST_INIT(sock->send_list);
1323         ISC_LIST_INIT(sock->accept_list);
1324         sock->connect_ev = NULL;
1325         sock->pending_recv = 0;
1326         sock->pending_send = 0;
1327         sock->pending_accept = 0;
1328         sock->listener = 0;
1329         sock->connected = 0;
1330         sock->connecting = 0;
1331         sock->bound = 0;
1332
1333         /*
1334          * initialize the lock
1335          */
1336         if (isc_mutex_init(&sock->lock) != ISC_R_SUCCESS) {
1337                 sock->magic = 0;
1338                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1339                                  "isc_mutex_init() %s",
1340                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1341                                                 ISC_MSG_FAILED, "failed"));
1342                 ret = ISC_R_UNEXPECTED;
1343                 goto error;
1344         }
1345
1346         /*
1347          * Initialize readable and writable events
1348          */
1349         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1350                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1351                        NULL, sock, sock, NULL, NULL);
1352         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1353                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1354                        NULL, sock, sock, NULL, NULL);
1355
1356         sock->magic = SOCKET_MAGIC;
1357         *socketp = sock;
1358
1359         return (ISC_R_SUCCESS);
1360
1361  error:
1362         if (sock->recvcmsgbuf != NULL)
1363                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1364                             sock->recvcmsgbuflen);
1365         if (sock->sendcmsgbuf != NULL)
1366                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1367                             sock->sendcmsgbuflen);
1368         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1369
1370         return (ret);
1371 }
1372
1373 /*
1374  * This event requires that the various lists be empty, that the reference
1375  * count be 1, and that the magic number is valid.  The other socket bits,
1376  * like the lock, must be initialized as well.  The fd associated must be
1377  * marked as closed, by setting it to -1 on close, or this routine will
1378  * also close the socket.
1379  */
1380 static void
1381 free_socket(isc_socket_t **socketp) {
1382         isc_socket_t *sock = *socketp;
1383
1384         INSIST(sock->references == 0);
1385         INSIST(VALID_SOCKET(sock));
1386         INSIST(!sock->connecting);
1387         INSIST(!sock->pending_recv);
1388         INSIST(!sock->pending_send);
1389         INSIST(!sock->pending_accept);
1390         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1391         INSIST(ISC_LIST_EMPTY(sock->send_list));
1392         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1393         INSIST(!ISC_LINK_LINKED(sock, link));
1394
1395         if (sock->recvcmsgbuf != NULL)
1396                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1397                             sock->recvcmsgbuflen);
1398         if (sock->sendcmsgbuf != NULL)
1399                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1400                             sock->sendcmsgbuflen);
1401
1402         sock->magic = 0;
1403
1404         DESTROYLOCK(&sock->lock);
1405
1406         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1407
1408         *socketp = NULL;
1409 }
1410
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Decide whether SO_BSDCOMPAT should still be set.  On Linux the option
 * is turned off (bsdcompat = ISC_FALSE) when the running kernel release
 * parses as 2.4 or later; on other platforms this does nothing.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
         struct utsname un;
         char *rest;
         long int kmajor;
         long int kminor;

         uname(&un);    /* Can only fail if the buffer is bad in Linux. */

         /* Paranoia in parsing can be increased, but we trust uname(). */
         kmajor = strtol(un.release, &rest, 10);
         if (*rest != '.')
                return;

         kminor = strtol(rest + 1, &rest, 10);
         if (kmajor > 2 || (kmajor == 2 && kminor >= 4))
                bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
1448
1449 /*%
1450  * Create a new 'type' socket managed by 'manager'.  Events
1451  * will be posted to 'task' and when dispatched 'action' will be
1452  * called with 'arg' as the arg value.  The new socket is returned
1453  * in 'socketp'.
1454  */
1455 isc_result_t
1456 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1457                   isc_socket_t **socketp)
1458 {
1459         isc_socket_t *sock = NULL;
1460         isc_result_t ret;
1461 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1462         int on = 1;
1463 #endif
1464         char strbuf[ISC_STRERRORSIZE];
1465         const char *err = "socket";
1466         int tries = 0;
1467
1468         REQUIRE(VALID_MANAGER(manager));
1469         REQUIRE(socketp != NULL && *socketp == NULL);
1470
1471         ret = allocate_socket(manager, type, &sock);
1472         if (ret != ISC_R_SUCCESS)
1473                 return (ret);
1474
1475         sock->pf = pf;
1476  again:
1477         switch (type) {
1478         case isc_sockettype_udp:
1479                 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1480                 break;
1481         case isc_sockettype_tcp:
1482                 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1483                 break;
1484         }
1485         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
1486                 goto again;
1487
1488 #ifdef F_DUPFD
1489         /*
1490          * Leave a space for stdio and TCP to work in.
1491          */
1492         if (manager->reserved != 0 && type == isc_sockettype_udp &&
1493             sock->fd >= 0 && sock->fd < manager->reserved) {
1494                 int new, tmp;
1495                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
1496                 tmp = errno;
1497                 (void)close(sock->fd);
1498                 errno = tmp;
1499                 sock->fd = new;
1500                 err = "isc_socket_create: fcntl/reserved";
1501         } else if (sock->fd >= 0 && sock->fd < 20) {
1502                 int new, tmp;
1503                 new = fcntl(sock->fd, F_DUPFD, 20);
1504                 tmp = errno;
1505                 (void)close(sock->fd);
1506                 errno = tmp;
1507                 sock->fd = new;
1508                 err = "isc_socket_create: fcntl";
1509         }
1510 #endif
1511
1512         if (sock->fd >= (int)manager->fdsize) {
1513                 (void)close(sock->fd);
1514                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1515                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1516                                isc_msgcat, ISC_MSGSET_SOCKET,
1517                                ISC_MSG_TOOMANYFDS,
1518                                "%s: too many open file descriptors", "socket");
1519                 free_socket(&sock);
1520                 return (ISC_R_NORESOURCES);
1521         }
1522
1523         if (sock->fd < 0) {
1524                 free_socket(&sock);
1525
1526                 switch (errno) {
1527                 case EMFILE:
1528                 case ENFILE:
1529                 case ENOBUFS:
1530                         return (ISC_R_NORESOURCES);
1531
1532                 case EPROTONOSUPPORT:
1533                 case EPFNOSUPPORT:
1534                 case EAFNOSUPPORT:
1535                 /*
1536                  * Linux 2.2 (and maybe others) return EINVAL instead of
1537                  * EAFNOSUPPORT.
1538                  */
1539                 case EINVAL:
1540                         return (ISC_R_FAMILYNOSUPPORT);
1541
1542                 default:
1543                         isc__strerror(errno, strbuf, sizeof(strbuf));
1544                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1545                                          "%s() %s: %s", err,
1546                                          isc_msgcat_get(isc_msgcat,
1547                                                         ISC_MSGSET_GENERAL,
1548                                                         ISC_MSG_FAILED,
1549                                                         "failed"),
1550                                          strbuf);
1551                         return (ISC_R_UNEXPECTED);
1552                 }
1553         }
1554
1555         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1556                 (void)close(sock->fd);
1557                 free_socket(&sock);
1558                 return (ISC_R_UNEXPECTED);
1559         }
1560
1561 #ifdef SO_BSDCOMPAT
1562         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
1563                                   clear_bsdcompat) == ISC_R_SUCCESS);
1564         if (bsdcompat && setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1565                                     (void *)&on, sizeof(on)) < 0) {
1566                 isc__strerror(errno, strbuf, sizeof(strbuf));
1567                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1568                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1569                                  sock->fd,
1570                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1571                                                 ISC_MSG_FAILED, "failed"),
1572                                  strbuf);
1573                 /* Press on... */
1574         }
1575 #endif
1576
1577 #ifdef SO_NOSIGPIPE
1578         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
1579                        (void *)&on, sizeof(on)) < 0) {
1580                 isc__strerror(errno, strbuf, sizeof(strbuf));
1581                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1582                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
1583                                  sock->fd,
1584                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1585                                                 ISC_MSG_FAILED, "failed"),
1586                                  strbuf);
1587                 /* Press on... */
1588         }
1589 #endif
1590
1591 #if defined(USE_CMSG)
1592         if (type == isc_sockettype_udp) {
1593
1594 #if defined(SO_TIMESTAMP)
1595                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1596                                (void *)&on, sizeof(on)) < 0
1597                     && errno != ENOPROTOOPT) {
1598                         isc__strerror(errno, strbuf, sizeof(strbuf));
1599                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1600                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1601                                          sock->fd,
1602                                          isc_msgcat_get(isc_msgcat,
1603                                                         ISC_MSGSET_GENERAL,
1604                                                         ISC_MSG_FAILED,
1605                                                         "failed"),
1606                                          strbuf);
1607                         /* Press on... */
1608                 }
1609 #endif /* SO_TIMESTAMP */
1610
1611 #if defined(ISC_PLATFORM_HAVEIPV6)
1612                 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1613                         /*
1614                          * Warn explicitly because this anomaly can be hidden
1615                          * in usual operation (and unexpectedly appear later).
1616                          */
1617                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1618                                          "No buffer available to receive "
1619                                          "IPv6 destination");
1620                 }
1621 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1622 #ifdef IPV6_RECVPKTINFO
1623                 /* RFC 3542 */
1624                 if ((pf == AF_INET6)
1625                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1626                                    (void *)&on, sizeof(on)) < 0)) {
1627                         isc__strerror(errno, strbuf, sizeof(strbuf));
1628                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1629                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1630                                          "%s: %s", sock->fd,
1631                                          isc_msgcat_get(isc_msgcat,
1632                                                         ISC_MSGSET_GENERAL,
1633                                                         ISC_MSG_FAILED,
1634                                                         "failed"),
1635                                          strbuf);
1636                 }
1637 #else
1638                 /* RFC 2292 */
1639                 if ((pf == AF_INET6)
1640                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1641                                    (void *)&on, sizeof(on)) < 0)) {
1642                         isc__strerror(errno, strbuf, sizeof(strbuf));
1643                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1644                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1645                                          sock->fd,
1646                                          isc_msgcat_get(isc_msgcat,
1647                                                         ISC_MSGSET_GENERAL,
1648                                                         ISC_MSG_FAILED,
1649                                                         "failed"),
1650                                          strbuf);
1651                 }
1652 #endif /* IPV6_RECVPKTINFO */
1653 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1654 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
1655                 /* use minimum MTU */
1656                 if (pf == AF_INET6) {
1657                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1658                                          IPV6_USE_MIN_MTU,
1659                                          (void *)&on, sizeof(on));
1660                 }
1661 #endif
1662 #endif /* ISC_PLATFORM_HAVEIPV6 */
1663
1664         }
1665 #endif /* USE_CMSG */
1666
1667         sock->references = 1;
1668         *socketp = sock;
1669
1670         LOCK(&manager->lock);
1671
1672         /*
1673          * Note we don't have to lock the socket like we normally would because
1674          * there are no external references to it yet.
1675          */
1676
1677         manager->fds[sock->fd] = sock;
1678         manager->fdstate[sock->fd] = MANAGED;
1679         ISC_LIST_APPEND(manager->socklist, sock, link);
1680         if (manager->maxfd < sock->fd)
1681                 manager->maxfd = sock->fd;
1682
1683         UNLOCK(&manager->lock);
1684
1685         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1686                    ISC_MSG_CREATED, "created");
1687
1688         return (ISC_R_SUCCESS);
1689 }
1690
1691 /*
1692  * Attach to a socket.  Caller must explicitly detach when it is done.
1693  */
1694 void
1695 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1696         REQUIRE(VALID_SOCKET(sock));
1697         REQUIRE(socketp != NULL && *socketp == NULL);
1698
1699         LOCK(&sock->lock);
1700         sock->references++;
1701         UNLOCK(&sock->lock);
1702
1703         *socketp = sock;
1704 }
1705
1706 /*
1707  * Dereference a socket.  If this is the last reference to it, clean things
1708  * up by destroying the socket.
1709  */
1710 void
1711 isc_socket_detach(isc_socket_t **socketp) {
1712         isc_socket_t *sock;
1713         isc_boolean_t kill_socket = ISC_FALSE;
1714
1715         REQUIRE(socketp != NULL);
1716         sock = *socketp;
1717         REQUIRE(VALID_SOCKET(sock));
1718
1719         LOCK(&sock->lock);
1720         REQUIRE(sock->references > 0);
1721         sock->references--;
1722         if (sock->references == 0)
1723                 kill_socket = ISC_TRUE;
1724         UNLOCK(&sock->lock);
1725
1726         if (kill_socket)
1727                 destroy(&sock);
1728
1729         *socketp = NULL;
1730 }
1731
1732 /*
1733  * I/O is possible on a given socket.  Schedule an event to this task that
1734  * will call an internal function to do the I/O.  This will charge the
1735  * task with the I/O operation and let our select loop handler get back
1736  * to doing something real as fast as possible.
1737  *
1738  * The socket and manager must be locked before calling this function.
1739  */
1740 static void
1741 dispatch_recv(isc_socket_t *sock) {
1742         intev_t *iev;
1743         isc_socketevent_t *ev;
1744
1745         INSIST(!sock->pending_recv);
1746
1747         ev = ISC_LIST_HEAD(sock->recv_list);
1748         if (ev == NULL)
1749                 return;
1750
1751         sock->pending_recv = 1;
1752         iev = &sock->readable_ev;
1753
1754         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1755                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
1756
1757         sock->references++;
1758         iev->ev_sender = sock;
1759         iev->ev_action = internal_recv;
1760         iev->ev_arg = sock;
1761
1762         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1763 }
1764
1765 static void
1766 dispatch_send(isc_socket_t *sock) {
1767         intev_t *iev;
1768         isc_socketevent_t *ev;
1769
1770         INSIST(!sock->pending_send);
1771
1772         ev = ISC_LIST_HEAD(sock->send_list);
1773         if (ev == NULL)
1774                 return;
1775
1776         sock->pending_send = 1;
1777         iev = &sock->writable_ev;
1778
1779         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1780                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
1781
1782         sock->references++;
1783         iev->ev_sender = sock;
1784         iev->ev_action = internal_send;
1785         iev->ev_arg = sock;
1786
1787         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1788 }
1789
1790 /*
1791  * Dispatch an internal accept event.
1792  */
1793 static void
1794 dispatch_accept(isc_socket_t *sock) {
1795         intev_t *iev;
1796         isc_socket_newconnev_t *ev;
1797
1798         INSIST(!sock->pending_accept);
1799
1800         /*
1801          * Are there any done events left, or were they all canceled
1802          * before the manager got the socket lock?
1803          */
1804         ev = ISC_LIST_HEAD(sock->accept_list);
1805         if (ev == NULL)
1806                 return;
1807
1808         sock->pending_accept = 1;
1809         iev = &sock->readable_ev;
1810
1811         sock->references++;  /* keep socket around for this internal event */
1812         iev->ev_sender = sock;
1813         iev->ev_action = internal_accept;
1814         iev->ev_arg = sock;
1815
1816         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1817 }
1818
1819 static void
1820 dispatch_connect(isc_socket_t *sock) {
1821         intev_t *iev;
1822         isc_socket_connev_t *ev;
1823
1824         iev = &sock->writable_ev;
1825
1826         ev = sock->connect_ev;
1827         INSIST(ev != NULL); /* XXX */
1828
1829         INSIST(sock->connecting);
1830
1831         sock->references++;  /* keep socket around for this internal event */
1832         iev->ev_sender = sock;
1833         iev->ev_action = internal_connect;
1834         iev->ev_arg = sock;
1835
1836         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1837 }
1838
1839 /*
1840  * Dequeue an item off the given socket's read queue, set the result code
1841  * in the done event to the one provided, and send it to the task it was
1842  * destined for.
1843  *
1844  * If the event to be sent is on a list, remove it before sending.  If
1845  * asked to, send and detach from the socket as well.
1846  *
1847  * Caller must have the socket locked if the event is attached to the socket.
1848  */
1849 static void
1850 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1851         isc_task_t *task;
1852
1853         task = (*dev)->ev_sender;
1854
1855         (*dev)->ev_sender = sock;
1856
1857         if (ISC_LINK_LINKED(*dev, ev_link))
1858                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1859
1860         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1861             == ISC_SOCKEVENTATTR_ATTACHED)
1862                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1863         else
1864                 isc_task_send(task, (isc_event_t **)dev);
1865 }
1866
1867 /*
1868  * See comments for send_recvdone_event() above.
1869  *
1870  * Caller must have the socket locked if the event is attached to the socket.
1871  */
1872 static void
1873 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1874         isc_task_t *task;
1875
1876         INSIST(dev != NULL && *dev != NULL);
1877
1878         task = (*dev)->ev_sender;
1879         (*dev)->ev_sender = sock;
1880
1881         if (ISC_LINK_LINKED(*dev, ev_link))
1882                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1883
1884         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1885             == ISC_SOCKEVENTATTR_ATTACHED)
1886                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1887         else
1888                 isc_task_send(task, (isc_event_t **)dev);
1889 }
1890
1891 /*
1892  * Call accept() on a socket, to get the new file descriptor.  The listen
1893  * socket is used as a prototype to create a new isc_socket_t.  The new
1894  * socket has one outstanding reference.  The task receiving the event
1895  * will be detached from just after the event is delivered.
1896  *
1897  * On entry to this function, the event delivered is the internal
1898  * readable event, and the first item on the accept_list should be
1899  * the done event we want to send.  If the list is empty, this is a no-op,
1900  * so just unlock and return.
1901  */
1902 static void
1903 internal_accept(isc_task_t *me, isc_event_t *ev) {
1904         isc_socket_t *sock;
1905         isc_socketmgr_t *manager;
1906         isc_socket_newconnev_t *dev;
1907         isc_task_t *task;
1908         ISC_SOCKADDR_LEN_T addrlen;
1909         int fd;
1910         isc_result_t result = ISC_R_SUCCESS;
1911         char strbuf[ISC_STRERRORSIZE];
1912         const char *err = "accept";
1913
1914         UNUSED(me);
1915
1916         sock = ev->ev_sender;
1917         INSIST(VALID_SOCKET(sock));
1918
1919         LOCK(&sock->lock);
1920         socket_log(sock, NULL, TRACE,
1921                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1922                    "internal_accept called, locked socket");
1923
1924         manager = sock->manager;
1925         INSIST(VALID_MANAGER(manager));
1926
1927         INSIST(sock->listener);
1928         INSIST(sock->pending_accept == 1);
1929         sock->pending_accept = 0;
1930
1931         INSIST(sock->references > 0);
1932         sock->references--;  /* the internal event is done with this socket */
1933         if (sock->references == 0) {
1934                 UNLOCK(&sock->lock);
1935                 destroy(&sock);
1936                 return;
1937         }
1938
1939         /*
1940          * Get the first item off the accept list.
1941          * If it is empty, unlock the socket and return.
1942          */
1943         dev = ISC_LIST_HEAD(sock->accept_list);
1944         if (dev == NULL) {
1945                 UNLOCK(&sock->lock);
1946                 return;
1947         }
1948
1949         /*
1950          * Try to accept the new connection.  If the accept fails with
1951          * EAGAIN or EINTR, simply poke the watcher to watch this socket
1952          * again.  Also ignore ECONNRESET, which has been reported to
1953          * be spuriously returned on Linux 2.2.19 although it is not
1954          * a documented error for accept().  ECONNABORTED has been
1955          * reported for Solaris 8.  The rest are thrown in not because
1956          * we have seen them but because they are ignored by other
1957          * deamons such as BIND 8 and Apache.
1958          */
1959
1960         addrlen = sizeof(dev->newsocket->address.type);
1961         memset(&dev->newsocket->address.type, 0, addrlen);
1962         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1963                     (void *)&addrlen);
1964
1965 #ifdef F_DUPFD
1966         /*
1967          * Leave a space for stdio to work in.
1968          */
1969         if (fd >= 0 && fd < 20) {
1970                 int new, tmp;
1971                 new = fcntl(fd, F_DUPFD, 20);
1972                 tmp = errno;
1973                 (void)close(fd);
1974                 errno = tmp;
1975                 fd = new;
1976                 err = "accept/fcntl";
1977         }
1978 #endif
1979
1980         if (fd < 0) {
1981                 if (SOFT_ERROR(errno))
1982                         goto soft_error;
1983                 switch (errno) {
1984                 case ENFILE:
1985                 case EMFILE:
1986                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1987                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1988                                        isc_msgcat, ISC_MSGSET_SOCKET,
1989                                        ISC_MSG_TOOMANYFDS,
1990                                        "%s: too many open file descriptors",
1991                                        err);
1992                         goto soft_error;
1993
1994                 case ENOBUFS:
1995                 case ENOMEM:
1996                 case ECONNRESET:
1997                 case ECONNABORTED:
1998                 case EHOSTUNREACH:
1999                 case EHOSTDOWN:
2000                 case ENETUNREACH:
2001                 case ENETDOWN:
2002                 case ECONNREFUSED:
2003 #ifdef EPROTO
2004                 case EPROTO:
2005 #endif
2006 #ifdef ENONET
2007                 case ENONET:
2008 #endif
2009                         goto soft_error;
2010                 default:
2011                         break;
2012                 }
2013                 isc__strerror(errno, strbuf, sizeof(strbuf));
2014                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2015                                  "internal_accept: %s() %s: %s", err,
2016                                  isc_msgcat_get(isc_msgcat,
2017                                                 ISC_MSGSET_GENERAL,
2018                                                 ISC_MSG_FAILED,
2019                                                 "failed"),
2020                                  strbuf);
2021                 fd = -1;
2022                 result = ISC_R_UNEXPECTED;
2023         } else {
2024                 if (addrlen == 0U) {
2025                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2026                                          "internal_accept(): "
2027                                          "accept() failed to return "
2028                                          "remote address");
2029
2030                         (void)close(fd);
2031                         goto soft_error;
2032                 } else if (dev->newsocket->address.type.sa.sa_family !=
2033                            sock->pf)
2034                 {
2035                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2036                                          "internal_accept(): "
2037                                          "accept() returned peer address "
2038                                          "family %u (expected %u)",
2039                                          dev->newsocket->address.
2040                                          type.sa.sa_family,
2041                                          sock->pf);
2042                         (void)close(fd);
2043                         goto soft_error;
2044                 } else if (fd >= (int)manager->fdsize) {
2045                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2046                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2047                                        isc_msgcat, ISC_MSGSET_SOCKET,
2048                                        ISC_MSG_TOOMANYFDS,
2049                                        "%s: too many open file descriptors",
2050                                        "accept");
2051                         (void)close(fd);
2052                         goto soft_error;
2053                 }
2054         }
2055
2056         if (fd != -1) {
2057                 dev->newsocket->address.length = addrlen;
2058                 dev->newsocket->pf = sock->pf;
2059         }
2060
2061         /*
2062          * Pull off the done event.
2063          */
2064         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2065
2066         /*
2067          * Poke watcher if there are more pending accepts.
2068          */
2069         if (!ISC_LIST_EMPTY(sock->accept_list))
2070                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2071
2072         UNLOCK(&sock->lock);
2073
2074         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2075                 (void)close(fd);
2076                 fd = -1;
2077                 result = ISC_R_UNEXPECTED;
2078         }
2079
2080         /*
2081          * -1 means the new socket didn't happen.
2082          */
2083         if (fd != -1) {
2084                 LOCK(&manager->lock);
2085                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2086
2087                 dev->newsocket->fd = fd;
2088                 dev->newsocket->bound = 1;
2089                 dev->newsocket->connected = 1;
2090
2091                 /*
2092                  * Save away the remote address
2093                  */
2094                 dev->address = dev->newsocket->address;
2095
2096                 manager->fds[fd] = dev->newsocket;
2097                 manager->fdstate[fd] = MANAGED;
2098                 if (manager->maxfd < fd)
2099                         manager->maxfd = fd;
2100
2101                 socket_log(sock, &dev->newsocket->address, CREATION,
2102                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2103                            "accepted connection, new socket %p",
2104                            dev->newsocket);
2105
2106                 UNLOCK(&manager->lock);
2107         } else {
2108                 dev->newsocket->references--;
2109                 free_socket(&dev->newsocket);
2110         }
2111
2112         /*
2113          * Fill in the done event details and send it off.
2114          */
2115         dev->result = result;
2116         task = dev->ev_sender;
2117         dev->ev_sender = sock;
2118
2119         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2120         return;
2121
2122  soft_error:
2123         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2124         UNLOCK(&sock->lock);
2125         return;
2126 }
2127
2128 static void
2129 internal_recv(isc_task_t *me, isc_event_t *ev) {
2130         isc_socketevent_t *dev;
2131         isc_socket_t *sock;
2132
2133         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2134
2135         sock = ev->ev_sender;
2136         INSIST(VALID_SOCKET(sock));
2137
2138         LOCK(&sock->lock);
2139         socket_log(sock, NULL, IOEVENT,
2140                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2141                    "internal_recv: task %p got event %p", me, ev);
2142
2143         INSIST(sock->pending_recv == 1);
2144         sock->pending_recv = 0;
2145
2146         INSIST(sock->references > 0);
2147         sock->references--;  /* the internal event is done with this socket */
2148         if (sock->references == 0) {
2149                 UNLOCK(&sock->lock);
2150                 destroy(&sock);
2151                 return;
2152         }
2153
2154         /*
2155          * Try to do as much I/O as possible on this socket.  There are no
2156          * limits here, currently.
2157          */
2158         dev = ISC_LIST_HEAD(sock->recv_list);
2159         while (dev != NULL) {
2160                 switch (doio_recv(sock, dev)) {
2161                 case DOIO_SOFT:
2162                         goto poke;
2163
2164                 case DOIO_EOF:
2165                         /*
2166                          * read of 0 means the remote end was closed.
2167                          * Run through the event queue and dispatch all
2168                          * the events with an EOF result code.
2169                          */
2170                         do {
2171                                 dev->result = ISC_R_EOF;
2172                                 send_recvdone_event(sock, &dev);
2173                                 dev = ISC_LIST_HEAD(sock->recv_list);
2174                         } while (dev != NULL);
2175                         goto poke;
2176
2177                 case DOIO_SUCCESS:
2178                 case DOIO_HARD:
2179                         send_recvdone_event(sock, &dev);
2180                         break;
2181                 }
2182
2183                 dev = ISC_LIST_HEAD(sock->recv_list);
2184         }
2185
2186  poke:
2187         if (!ISC_LIST_EMPTY(sock->recv_list))
2188                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2189
2190         UNLOCK(&sock->lock);
2191 }
2192
2193 static void
2194 internal_send(isc_task_t *me, isc_event_t *ev) {
2195         isc_socketevent_t *dev;
2196         isc_socket_t *sock;
2197
2198         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2199
2200         /*
2201          * Find out what socket this is and lock it.
2202          */
2203         sock = (isc_socket_t *)ev->ev_sender;
2204         INSIST(VALID_SOCKET(sock));
2205
2206         LOCK(&sock->lock);
2207         socket_log(sock, NULL, IOEVENT,
2208                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2209                    "internal_send: task %p got event %p", me, ev);
2210
2211         INSIST(sock->pending_send == 1);
2212         sock->pending_send = 0;
2213
2214         INSIST(sock->references > 0);
2215         sock->references--;  /* the internal event is done with this socket */
2216         if (sock->references == 0) {
2217                 UNLOCK(&sock->lock);
2218                 destroy(&sock);
2219                 return;
2220         }
2221
2222         /*
2223          * Try to do as much I/O as possible on this socket.  There are no
2224          * limits here, currently.
2225          */
2226         dev = ISC_LIST_HEAD(sock->send_list);
2227         while (dev != NULL) {
2228                 switch (doio_send(sock, dev)) {
2229                 case DOIO_SOFT:
2230                         goto poke;
2231
2232                 case DOIO_HARD:
2233                 case DOIO_SUCCESS:
2234                         send_senddone_event(sock, &dev);
2235                         break;
2236                 }
2237
2238                 dev = ISC_LIST_HEAD(sock->send_list);
2239         }
2240
2241  poke:
2242         if (!ISC_LIST_EMPTY(sock->send_list))
2243                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2244
2245         UNLOCK(&sock->lock);
2246 }
2247
2248 static void
2249 process_fds(isc_socketmgr_t *manager, int maxfd,
2250             fd_set *readfds, fd_set *writefds)
2251 {
2252         int i;
2253         isc_socket_t *sock;
2254         isc_boolean_t unlock_sock;
2255
2256         REQUIRE(maxfd <= (int)manager->fdsize);
2257
2258         /*
2259          * Process read/writes on other fds here.  Avoid locking
2260          * and unlocking twice if both reads and writes are possible.
2261          */
2262         for (i = 0; i < maxfd; i++) {
2263 #ifdef ISC_PLATFORM_USETHREADS
2264                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2265                         continue;
2266 #endif /* ISC_PLATFORM_USETHREADS */
2267
2268                 if (manager->fdstate[i] == CLOSE_PENDING) {
2269                         manager->fdstate[i] = CLOSED;
2270                         FD_CLR(i, manager->read_fds);
2271                         FD_CLR(i, manager->write_fds);
2272
2273                         (void)close(i);
2274
2275                         continue;
2276                 }
2277
2278                 sock = manager->fds[i];
2279                 unlock_sock = ISC_FALSE;
2280                 if (FD_ISSET(i, readfds)) {
2281                         if (sock == NULL) {
2282                                 FD_CLR(i, manager->read_fds);
2283                                 goto check_write;
2284                         }
2285                         unlock_sock = ISC_TRUE;
2286                         LOCK(&sock->lock);
2287                         if (!SOCK_DEAD(sock)) {
2288                                 if (sock->listener)
2289                                         dispatch_accept(sock);
2290                                 else
2291                                         dispatch_recv(sock);
2292                         }
2293                         FD_CLR(i, manager->read_fds);
2294                 }
2295         check_write:
2296                 if (FD_ISSET(i, writefds)) {
2297                         if (sock == NULL) {
2298                                 FD_CLR(i, manager->write_fds);
2299                                 continue;
2300                         }
2301                         if (!unlock_sock) {
2302                                 unlock_sock = ISC_TRUE;
2303                                 LOCK(&sock->lock);
2304                         }
2305                         if (!SOCK_DEAD(sock)) {
2306                                 if (sock->connecting)
2307                                         dispatch_connect(sock);
2308                                 else
2309                                         dispatch_send(sock);
2310                         }
2311                         FD_CLR(i, manager->write_fds);
2312                 }
2313                 if (unlock_sock)
2314                         UNLOCK(&sock->lock);
2315         }
2316 }
2317
#ifdef ISC_PLATFORM_USETHREADS
/*
 * Body of the watcher thread.  It loops, blocking in select() on copies
 * of the manager's descriptor sets, until a shutdown message arrives on
 * the control pipe.
 *
 * Each time select() reports activity, control messages are drained
 * first (each one either adjusts what is being watched or requests
 * shutdown), and then process_fds() posts the appropriate internal
 * events for the ready descriptors.
 *
 * The manager lock is held throughout except while blocked in select().
 */
static isc_threadresult_t
watcher(void *uap) {
	isc_socketmgr_t *manager = uap;
	isc_boolean_t done = ISC_FALSE;
	int ctlfd;
	int nready;
	int msg, fd;
	int maxfd;
	char strbuf[ISC_STRERRORSIZE];

	/*
	 * The read end of the control pipe is fetched once; it does not
	 * change afterwards.
	 */
	LOCK(&manager->lock);
	ctlfd = manager->pipe_fds[0];

	while (!done) {
		/*
		 * Snapshot the watch sets under the lock, then wait without
		 * it.  A soft select() failure simply retries with a fresh
		 * snapshot; any other failure is fatal.
		 */
		do {
			memcpy(manager->read_fds_copy, manager->read_fds,
			       manager->fd_bufsize);
			memcpy(manager->write_fds_copy, manager->write_fds,
			       manager->fd_bufsize);
			maxfd = manager->maxfd + 1;

			UNLOCK(&manager->lock);

			nready = select(maxfd, manager->read_fds_copy,
					manager->write_fds_copy, NULL, NULL);
			if (nready < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "select() %s: %s",
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"),
					    strbuf);
			}

			LOCK(&manager->lock);
		} while (nready < 0);

		/*
		 * Drain the internal control descriptor.
		 */
		if (FD_ISSET(ctlfd, manager->read_fds_copy)) {
			for (;;) {
				select_readmsg(manager, &fd, &msg);

				manager_log(manager, IOEVENT,
					    isc_msgcat_get(isc_msgcat,
						     ISC_MSGSET_SOCKET,
						     ISC_MSG_WATCHERMSG,
						     "watcher got message %d"),
					    msg);

				/*
				 * A shutdown request ends the thread.  The
				 * ready descriptors from this pass are
				 * still processed below before exiting;
				 * that extra work is harmless.
				 */
				if (msg == SELECT_POKE_SHUTDOWN)
					done = ISC_TRUE;

				/* Stop on shutdown or when nothing is left. */
				if (done || msg == SELECT_POKE_NOTHING)
					break;

				/*
				 * Otherwise this is a wakeup for a socket:
				 * re-evaluate whether it should be watched
				 * for reading and/or writing.
				 */
				wakeup_socket(manager, fd, msg);
			}
		}

		process_fds(manager, maxfd, manager->read_fds_copy,
			    manager->write_fds_copy);
	}

	manager_log(manager, TRACE,
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	UNLOCK(&manager->lock);
	return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
2427
2428 void
2429 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
2430
2431         REQUIRE(VALID_MANAGER(manager));
2432
2433         manager->reserved = reserved;
2434 }
2435
2436 /*
2437  * Initialize fdsets in socketmgr structure.
2438  */
2439 static isc_result_t
2440 create_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
2441 #if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
2442         manager->fdsize = ISC_SOCKET_FDSETSIZE;
2443         manager->fd_bufsize = howmany(ISC_SOCKET_FDSETSIZE, NFDBITS) *
2444                 sizeof(fd_mask);
2445 #else
2446         manager->fdsize = FD_SETSIZE;
2447         manager->fd_bufsize = sizeof(fd_set);
2448 #endif
2449
2450         manager->fds = NULL;
2451         manager->fdstate = NULL;
2452         manager->read_fds = NULL;
2453         manager->read_fds_copy = NULL;
2454         manager->write_fds = NULL;
2455         manager->write_fds_copy = NULL;
2456
2457         manager->fds = isc_mem_get(mctx,
2458                                    manager->fdsize * sizeof(manager->fds[0]));
2459         if (manager->fds == NULL)
2460                 goto fail;
2461
2462         manager->fdstate = isc_mem_get(mctx, manager->fdsize *
2463                                        sizeof(manager->fdstate[0]));
2464         if (manager->fdstate == NULL)
2465                 goto fail;
2466
2467         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
2468         if (manager->read_fds == NULL)
2469                 goto fail;
2470         manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
2471         if (manager->read_fds_copy == NULL)
2472                 goto fail;
2473         manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
2474         if (manager->write_fds == NULL)
2475                 goto fail;
2476         manager->write_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
2477         if (manager->write_fds_copy == NULL)
2478                 goto fail;
2479
2480         return (ISC_R_SUCCESS);
2481
2482   fail:
2483         cleanup_fdsets(manager, mctx);
2484         return (ISC_R_NOMEMORY);
2485 }
2486
2487 /*
2488  * Clean up fdsets in socketmgr structure.
2489  */
2490 static void
2491 cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
2492         if (manager->fds != NULL) {
2493                 isc_mem_put(mctx, manager->fds,
2494                             manager->fdsize * sizeof(manager->fds[0]));
2495         }
2496         if (manager->fdstate != NULL) {
2497                 isc_mem_put(mctx, manager->fdstate,
2498                             manager->fdsize * sizeof(manager->fdstate[0]));
2499         }
2500         if (manager->read_fds != NULL)
2501                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
2502         if (manager->read_fds_copy != NULL)
2503                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
2504         if (manager->write_fds != NULL)
2505                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
2506         if (manager->write_fds_copy != NULL)
2507                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
2508 }
2509
2510 /*
2511  * Create a new socket manager.
2512  */
2513 isc_result_t
2514 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2515         isc_socketmgr_t *manager;
2516 #ifdef ISC_PLATFORM_USETHREADS
2517         char strbuf[ISC_STRERRORSIZE];
2518 #endif
2519         isc_result_t result;
2520
2521         REQUIRE(managerp != NULL && *managerp == NULL);
2522
2523 #ifndef ISC_PLATFORM_USETHREADS
2524         if (socketmgr != NULL) {
2525                 socketmgr->refs++;
2526                 *managerp = socketmgr;
2527                 return (ISC_R_SUCCESS);
2528         }
2529 #endif /* ISC_PLATFORM_USETHREADS */
2530
2531         manager = isc_mem_get(mctx, sizeof(*manager));
2532         if (manager == NULL)
2533                 return (ISC_R_NOMEMORY);
2534
2535         result = create_fdsets(manager, mctx);
2536         if (result != ISC_R_SUCCESS) {
2537                 cleanup_fdsets(manager, mctx);
2538                 isc_mem_put(mctx, manager, sizeof(*manager));
2539                 return (result);
2540         }
2541
2542         manager->magic = SOCKET_MANAGER_MAGIC;
2543         manager->mctx = NULL;
2544         ISC_LIST_INIT(manager->socklist);
2545         result = isc_mutex_init(&manager->lock);
2546         if (result != ISC_R_SUCCESS) {
2547                 cleanup_fdsets(manager, mctx);
2548                 isc_mem_put(mctx, manager, sizeof(*manager));
2549                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2550                                  "isc_mutex_init() %s",
2551                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2552                                                 ISC_MSG_FAILED, "failed"));
2553                 return (ISC_R_UNEXPECTED);
2554         }
2555 #ifdef ISC_PLATFORM_USETHREADS
2556         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2557                 cleanup_fdsets(manager, mctx);
2558                 DESTROYLOCK(&manager->lock);
2559                 isc_mem_put(mctx, manager, sizeof(*manager));
2560                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2561                                  "isc_condition_init() %s",
2562                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2563                                                 ISC_MSG_FAILED, "failed"));
2564                 return (ISC_R_UNEXPECTED);
2565         }
2566
2567         /*
2568          * Create the special fds that will be used to wake up the
2569          * select/poll loop when something internal needs to be done.
2570          */
2571         if (pipe(manager->pipe_fds) != 0) {
2572                 cleanup_fdsets(manager, mctx);
2573                 DESTROYLOCK(&manager->lock);
2574                 isc_mem_put(mctx, manager, sizeof(*manager));
2575                 isc__strerror(errno, strbuf, sizeof(strbuf));
2576                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2577                                  "pipe() %s: %s",
2578                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2579                                                 ISC_MSG_FAILED, "failed"),
2580                                  strbuf);
2581
2582                 return (ISC_R_UNEXPECTED);
2583         }
2584
2585         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2586 #if 0
2587         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2588 #endif
2589 #else /* ISC_PLATFORM_USETHREADS */
2590         manager->refs = 1;
2591 #endif /* ISC_PLATFORM_USETHREADS */
2592
2593         /*
2594          * Set up initial state for the select loop
2595          */
2596         memset(manager->read_fds, 0, manager->fd_bufsize);
2597         memset(manager->write_fds, 0, manager->fd_bufsize);
2598 #ifdef ISC_PLATFORM_USETHREADS
2599         FD_SET(manager->pipe_fds[0], manager->read_fds);
2600         manager->maxfd = manager->pipe_fds[0];
2601 #else /* ISC_PLATFORM_USETHREADS */
2602         manager->maxfd = 0;
2603 #endif /* ISC_PLATFORM_USETHREADS */
2604         manager->reserved = 0;
2605         memset(manager->fdstate, 0,
2606                manager->fdsize * sizeof(manager->fdstate[0]));
2607
2608 #ifdef ISC_PLATFORM_USETHREADS
2609         /*
2610          * Start up the select/poll thread.
2611          */
2612         if (isc_thread_create(watcher, manager, &manager->watcher) !=
2613             ISC_R_SUCCESS) {
2614                 (void)close(manager->pipe_fds[0]);
2615                 (void)close(manager->pipe_fds[1]);
2616                 DESTROYLOCK(&manager->lock);
2617                 isc_mem_put(mctx, manager, sizeof(*manager));
2618                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2619                                  "isc_thread_create() %s",
2620                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2621                                                 ISC_MSG_FAILED, "failed"));
2622                 return (ISC_R_UNEXPECTED);
2623         }
2624 #endif /* ISC_PLATFORM_USETHREADS */
2625         isc_mem_attach(mctx, &manager->mctx);
2626
2627 #ifndef ISC_PLATFORM_USETHREADS
2628         socketmgr = manager;
2629 #endif /* ISC_PLATFORM_USETHREADS */
2630         *managerp = manager;
2631
2632         return (ISC_R_SUCCESS);
2633 }
2634
2635 void
2636 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2637         isc_socketmgr_t *manager;
2638         int i;
2639         isc_mem_t *mctx;
2640
2641         /*
2642          * Destroy a socket manager.
2643          */
2644
2645         REQUIRE(managerp != NULL);
2646         manager = *managerp;
2647         REQUIRE(VALID_MANAGER(manager));
2648
2649 #ifndef ISC_PLATFORM_USETHREADS
2650         if (manager->refs > 1) {
2651                 manager->refs--;
2652                 *managerp = NULL;
2653                 return;
2654         }
2655 #endif /* ISC_PLATFORM_USETHREADS */
2656
2657         LOCK(&manager->lock);
2658
2659 #ifdef ISC_PLATFORM_USETHREADS
2660         /*
2661          * Wait for all sockets to be destroyed.
2662          */
2663         while (!ISC_LIST_EMPTY(manager->socklist)) {
2664                 manager_log(manager, CREATION,
2665                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2666                                            ISC_MSG_SOCKETSREMAIN,
2667                                            "sockets exist"));
2668                 WAIT(&manager->shutdown_ok, &manager->lock);
2669         }
2670 #else /* ISC_PLATFORM_USETHREADS */
2671         /*
2672          * Hope all sockets have been destroyed.
2673          */
2674         if (!ISC_LIST_EMPTY(manager->socklist)) {
2675                 manager_log(manager, CREATION,
2676                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2677                                            ISC_MSG_SOCKETSREMAIN,
2678                                            "sockets exist"));
2679                 INSIST(0);
2680         }
2681 #endif /* ISC_PLATFORM_USETHREADS */
2682
2683         UNLOCK(&manager->lock);
2684
2685         /*
2686          * Here, poke our select/poll thread.  Do this by closing the write
2687          * half of the pipe, which will send EOF to the read half.
2688          * This is currently a no-op in the non-threaded case.
2689          */
2690         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2691
2692 #ifdef ISC_PLATFORM_USETHREADS
2693         /*
2694          * Wait for thread to exit.
2695          */
2696         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2697                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2698                                  "isc_thread_join() %s",
2699                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2700                                                 ISC_MSG_FAILED, "failed"));
2701 #endif /* ISC_PLATFORM_USETHREADS */
2702
2703         /*
2704          * Clean up.
2705          */
2706 #ifdef ISC_PLATFORM_USETHREADS
2707         (void)close(manager->pipe_fds[0]);
2708         (void)close(manager->pipe_fds[1]);
2709         (void)isc_condition_destroy(&manager->shutdown_ok);
2710 #endif /* ISC_PLATFORM_USETHREADS */
2711
2712         for (i = 0; i < (int)manager->fdsize; i++)
2713                 if (manager->fdstate[i] == CLOSE_PENDING)
2714                         (void)close(i);
2715
2716         DESTROYLOCK(&manager->lock);
2717         cleanup_fdsets(manager, manager->mctx);
2718         manager->magic = 0;
2719         mctx= manager->mctx;
2720         isc_mem_put(mctx, manager, sizeof(*manager));
2721
2722         isc_mem_detach(&mctx);
2723
2724         *managerp = NULL;
2725 }
2726
2727 static isc_result_t
2728 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2729             unsigned int flags)
2730 {
2731         int io_state;
2732         isc_boolean_t have_lock = ISC_FALSE;
2733         isc_task_t *ntask = NULL;
2734         isc_result_t result = ISC_R_SUCCESS;
2735
2736         dev->ev_sender = task;
2737
2738         if (sock->type == isc_sockettype_udp) {
2739                 io_state = doio_recv(sock, dev);
2740         } else {
2741                 LOCK(&sock->lock);
2742                 have_lock = ISC_TRUE;
2743
2744                 if (ISC_LIST_EMPTY(sock->recv_list))
2745                         io_state = doio_recv(sock, dev);
2746                 else
2747                         io_state = DOIO_SOFT;
2748         }
2749
2750         switch (io_state) {
2751         case DOIO_SOFT:
2752                 /*
2753                  * We couldn't read all or part of the request right now, so
2754                  * queue it.
2755                  *
2756                  * Attach to socket and to task
2757                  */
2758                 isc_task_attach(task, &ntask);
2759                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2760
2761                 if (!have_lock) {
2762                         LOCK(&sock->lock);
2763                         have_lock = ISC_TRUE;
2764                 }
2765
2766                 /*
2767                  * Enqueue the request.  If the socket was previously not being
2768                  * watched, poke the watcher to start paying attention to it.
2769                  */
2770                 if (ISC_LIST_EMPTY(sock->recv_list))
2771                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2772                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2773
2774                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2775                            "socket_recv: event %p -> task %p",
2776                            dev, ntask);
2777
2778                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2779                         result = ISC_R_INPROGRESS;
2780                 break;
2781
2782         case DOIO_EOF:
2783                 dev->result = ISC_R_EOF;
2784                 /* fallthrough */
2785
2786         case DOIO_HARD:
2787         case DOIO_SUCCESS:
2788                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2789                         send_recvdone_event(sock, &dev);
2790                 break;
2791         }
2792
2793         if (have_lock)
2794                 UNLOCK(&sock->lock);
2795
2796         return (result);
2797 }
2798
2799 isc_result_t
2800 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2801                  unsigned int minimum, isc_task_t *task,
2802                  isc_taskaction_t action, const void *arg)
2803 {
2804         isc_socketevent_t *dev;
2805         isc_socketmgr_t *manager;
2806         unsigned int iocount;
2807         isc_buffer_t *buffer;
2808
2809         REQUIRE(VALID_SOCKET(sock));
2810         REQUIRE(buflist != NULL);
2811         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2812         REQUIRE(task != NULL);
2813         REQUIRE(action != NULL);
2814
2815         manager = sock->manager;
2816         REQUIRE(VALID_MANAGER(manager));
2817
2818         iocount = isc_bufferlist_availablecount(buflist);
2819         REQUIRE(iocount > 0);
2820
2821         INSIST(sock->bound);
2822
2823         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2824         if (dev == NULL) {
2825                 return (ISC_R_NOMEMORY);
2826         }
2827
2828         /*
2829          * UDP sockets are always partial read
2830          */
2831         if (sock->type == isc_sockettype_udp)
2832                 dev->minimum = 1;
2833         else {
2834                 if (minimum == 0)
2835                         dev->minimum = iocount;
2836                 else
2837                         dev->minimum = minimum;
2838         }
2839
2840         /*
2841          * Move each buffer from the passed in list to our internal one.
2842          */
2843         buffer = ISC_LIST_HEAD(*buflist);
2844         while (buffer != NULL) {
2845                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2846                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2847                 buffer = ISC_LIST_HEAD(*buflist);
2848         }
2849
2850         return (socket_recv(sock, dev, task, 0));
2851 }
2852
2853 isc_result_t
2854 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2855                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2856 {
2857         isc_socketevent_t *dev;
2858         isc_socketmgr_t *manager;
2859
2860         REQUIRE(VALID_SOCKET(sock));
2861         REQUIRE(action != NULL);
2862
2863         manager = sock->manager;
2864         REQUIRE(VALID_MANAGER(manager));
2865
2866         INSIST(sock->bound);
2867
2868         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2869         if (dev == NULL)
2870                 return (ISC_R_NOMEMORY);
2871
2872         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2873 }
2874
2875 isc_result_t
2876 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2877                  unsigned int minimum, isc_task_t *task,
2878                  isc_socketevent_t *event, unsigned int flags)
2879 {
2880         event->ev_sender = sock;
2881         event->result = ISC_R_UNEXPECTED;
2882         ISC_LIST_INIT(event->bufferlist);
2883         event->region = *region;
2884         event->n = 0;
2885         event->offset = 0;
2886         event->attributes = 0;
2887
2888         /*
2889          * UDP sockets are always partial read.
2890          */
2891         if (sock->type == isc_sockettype_udp)
2892                 event->minimum = 1;
2893         else {
2894                 if (minimum == 0)
2895                         event->minimum = region->length;
2896                 else
2897                         event->minimum = minimum;
2898         }
2899
2900         return (socket_recv(sock, event, task, flags));
2901 }
2902
2903 static isc_result_t
2904 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2905             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2906             unsigned int flags)
2907 {
2908         int io_state;
2909         isc_boolean_t have_lock = ISC_FALSE;
2910         isc_task_t *ntask = NULL;
2911         isc_result_t result = ISC_R_SUCCESS;
2912
2913         dev->ev_sender = task;
2914
2915         set_dev_address(address, sock, dev);
2916         if (pktinfo != NULL) {
2917                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2918                 dev->pktinfo = *pktinfo;
2919
2920                 if (!isc_sockaddr_issitelocal(&dev->address) &&
2921                     !isc_sockaddr_islinklocal(&dev->address)) {
2922                         socket_log(sock, NULL, TRACE, isc_msgcat,
2923                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2924                                    "pktinfo structure provided, ifindex %u "
2925                                    "(set to 0)", pktinfo->ipi6_ifindex);
2926
2927                         /*
2928                          * Set the pktinfo index to 0 here, to let the
2929                          * kernel decide what interface it should send on.
2930                          */
2931                         dev->pktinfo.ipi6_ifindex = 0;
2932                 }
2933         }
2934
2935         if (sock->type == isc_sockettype_udp)
2936                 io_state = doio_send(sock, dev);
2937         else {
2938                 LOCK(&sock->lock);
2939                 have_lock = ISC_TRUE;
2940
2941                 if (ISC_LIST_EMPTY(sock->send_list))
2942                         io_state = doio_send(sock, dev);
2943                 else
2944                         io_state = DOIO_SOFT;
2945         }
2946
2947         switch (io_state) {
2948         case DOIO_SOFT:
2949                 /*
2950                  * We couldn't send all or part of the request right now, so
2951                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2952                  */
2953                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2954                         isc_task_attach(task, &ntask);
2955                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2956
2957                         if (!have_lock) {
2958                                 LOCK(&sock->lock);
2959                                 have_lock = ISC_TRUE;
2960                         }
2961
2962                         /*
2963                          * Enqueue the request.  If the socket was previously
2964                          * not being watched, poke the watcher to start
2965                          * paying attention to it.
2966                          */
2967                         if (ISC_LIST_EMPTY(sock->send_list))
2968                                 select_poke(sock->manager, sock->fd,
2969                                             SELECT_POKE_WRITE);
2970                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2971
2972                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2973                                    "socket_send: event %p -> task %p",
2974                                    dev, ntask);
2975
2976                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2977                                 result = ISC_R_INPROGRESS;
2978                         break;
2979                 }
2980
2981         case DOIO_HARD:
2982         case DOIO_SUCCESS:
2983                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2984                         send_senddone_event(sock, &dev);
2985                 break;
2986         }
2987
2988         if (have_lock)
2989                 UNLOCK(&sock->lock);
2990
2991         return (result);
2992 }
2993
2994 isc_result_t
2995 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2996                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2997 {
2998         /*
2999          * REQUIRE() checking is performed in isc_socket_sendto().
3000          */
3001         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3002                                   NULL));
3003 }
3004
3005 isc_result_t
3006 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
3007                   isc_task_t *task, isc_taskaction_t action, const void *arg,
3008                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3009 {
3010         isc_socketevent_t *dev;
3011         isc_socketmgr_t *manager;
3012
3013         REQUIRE(VALID_SOCKET(sock));
3014         REQUIRE(region != NULL);
3015         REQUIRE(task != NULL);
3016         REQUIRE(action != NULL);
3017
3018         manager = sock->manager;
3019         REQUIRE(VALID_MANAGER(manager));
3020
3021         INSIST(sock->bound);
3022
3023         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3024         if (dev == NULL) {
3025                 return (ISC_R_NOMEMORY);
3026         }
3027
3028         dev->region = *region;
3029
3030         return (socket_send(sock, dev, task, address, pktinfo, 0));
3031 }
3032
3033 isc_result_t
3034 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3035                  isc_task_t *task, isc_taskaction_t action, const void *arg)
3036 {
3037         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3038                                    NULL));
3039 }
3040
3041 isc_result_t
3042 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3043                    isc_task_t *task, isc_taskaction_t action, const void *arg,
3044                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3045 {
3046         isc_socketevent_t *dev;
3047         isc_socketmgr_t *manager;
3048         unsigned int iocount;
3049         isc_buffer_t *buffer;
3050
3051         REQUIRE(VALID_SOCKET(sock));
3052         REQUIRE(buflist != NULL);
3053         REQUIRE(!ISC_LIST_EMPTY(*buflist));
3054         REQUIRE(task != NULL);
3055         REQUIRE(action != NULL);
3056
3057         manager = sock->manager;
3058         REQUIRE(VALID_MANAGER(manager));
3059
3060         iocount = isc_bufferlist_usedcount(buflist);
3061         REQUIRE(iocount > 0);
3062
3063         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3064         if (dev == NULL) {
3065                 return (ISC_R_NOMEMORY);
3066         }
3067
3068         /*
3069          * Move each buffer from the passed in list to our internal one.
3070          */
3071         buffer = ISC_LIST_HEAD(*buflist);
3072         while (buffer != NULL) {
3073                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
3074                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3075                 buffer = ISC_LIST_HEAD(*buflist);
3076         }
3077
3078         return (socket_send(sock, dev, task, address, pktinfo, 0));
3079 }
3080
3081 isc_result_t
3082 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3083                    isc_task_t *task,
3084                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3085                    isc_socketevent_t *event, unsigned int flags)
3086 {
3087         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3088         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3089                 REQUIRE(sock->type == isc_sockettype_udp);
3090         event->ev_sender = sock;
3091         event->result = ISC_R_UNEXPECTED;
3092         ISC_LIST_INIT(event->bufferlist);
3093         event->region = *region;
3094         event->n = 0;
3095         event->offset = 0;
3096         event->attributes = 0;
3097
3098         return (socket_send(sock, event, task, address, pktinfo, flags));
3099 }
3100
3101 isc_result_t
3102 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3103                 unsigned int options) {
3104         char strbuf[ISC_STRERRORSIZE];
3105         int on = 1;
3106
3107         LOCK(&sock->lock);
3108
3109         INSIST(!sock->bound);
3110
3111         if (sock->pf != sockaddr->type.sa.sa_family) {
3112                 UNLOCK(&sock->lock);
3113                 return (ISC_R_FAMILYMISMATCH);
3114         }
3115         /*
3116          * Only set SO_REUSEADDR when we want a specific port.
3117          */
3118         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3119             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3120             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3121                        sizeof(on)) < 0) {
3122                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3123                                  "setsockopt(%d) %s", sock->fd,
3124                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3125                                                 ISC_MSG_FAILED, "failed"));
3126                 /* Press on... */
3127         }
3128         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3129                 UNLOCK(&sock->lock);
3130                 switch (errno) {
3131                 case EACCES:
3132                         return (ISC_R_NOPERM);
3133                 case EADDRNOTAVAIL:
3134                         return (ISC_R_ADDRNOTAVAIL);
3135                 case EADDRINUSE:
3136                         return (ISC_R_ADDRINUSE);
3137                 case EINVAL:
3138                         return (ISC_R_BOUND);
3139                 default:
3140                         isc__strerror(errno, strbuf, sizeof(strbuf));
3141                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3142                                          strbuf);
3143                         return (ISC_R_UNEXPECTED);
3144                 }
3145         }
3146
3147         socket_log(sock, sockaddr, TRACE,
3148                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3149         sock->bound = 1;
3150
3151         UNLOCK(&sock->lock);
3152         return (ISC_R_SUCCESS);
3153 }
3154
3155 isc_result_t
3156 isc_socket_filter(isc_socket_t *sock, const char *filter) {
3157 #ifdef SO_ACCEPTFILTER
3158         char strbuf[ISC_STRERRORSIZE];
3159         struct accept_filter_arg afa;
3160 #else
3161         UNUSED(sock);
3162         UNUSED(filter);
3163 #endif
3164
3165         REQUIRE(VALID_SOCKET(sock));
3166
3167 #ifdef SO_ACCEPTFILTER
3168         bzero(&afa, sizeof(afa));
3169         strncpy(afa.af_name, filter, sizeof(afa.af_name));
3170         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
3171                          &afa, sizeof(afa)) == -1) {
3172                 isc__strerror(errno, strbuf, sizeof(strbuf));
3173                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
3174                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
3175                            strbuf);
3176                 return (ISC_R_FAILURE);
3177         }
3178         return (ISC_R_SUCCESS);
3179 #else
3180         return (ISC_R_NOTIMPLEMENTED);
3181 #endif
3182 }
3183
3184 /*
3185  * Set up to listen on a given socket.  We do this by creating an internal
3186  * event that will be dispatched when the socket has read activity.  The
3187  * watcher will send the internal event to the task when there is a new
3188  * connection.
3189  *
3190  * Unlike in read, we don't preallocate a done event here.  Every time there
3191  * is a new connection we'll have to allocate a new one anyway, so we might
3192  * as well keep things simple rather than having to track them.
3193  */
3194 isc_result_t
3195 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
3196         char strbuf[ISC_STRERRORSIZE];
3197
3198         REQUIRE(VALID_SOCKET(sock));
3199
3200         LOCK(&sock->lock);
3201
3202         REQUIRE(!sock->listener);
3203         REQUIRE(sock->bound);
3204         REQUIRE(sock->type == isc_sockettype_tcp);
3205
3206         if (backlog == 0)
3207                 backlog = SOMAXCONN;
3208
3209         if (listen(sock->fd, (int)backlog) < 0) {
3210                 UNLOCK(&sock->lock);
3211                 isc__strerror(errno, strbuf, sizeof(strbuf));
3212
3213                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3214
3215                 return (ISC_R_UNEXPECTED);
3216         }
3217
3218         sock->listener = 1;
3219
3220         UNLOCK(&sock->lock);
3221         return (ISC_R_SUCCESS);
3222 }
3223
3224 /*
3225  * This should try to do agressive accept() XXXMLG
3226  */
3227 isc_result_t
3228 isc_socket_accept(isc_socket_t *sock,
3229                   isc_task_t *task, isc_taskaction_t action, const void *arg)
3230 {
3231         isc_socket_newconnev_t *dev;
3232         isc_socketmgr_t *manager;
3233         isc_task_t *ntask = NULL;
3234         isc_socket_t *nsock;
3235         isc_result_t ret;
3236         isc_boolean_t do_poke = ISC_FALSE;
3237
3238         REQUIRE(VALID_SOCKET(sock));
3239         manager = sock->manager;
3240         REQUIRE(VALID_MANAGER(manager));
3241
3242         LOCK(&sock->lock);
3243
3244         REQUIRE(sock->listener);
3245
3246         /*
3247          * Sender field is overloaded here with the task we will be sending
3248          * this event to.  Just before the actual event is delivered the
3249          * actual ev_sender will be touched up to be the socket.
3250          */
3251         dev = (isc_socket_newconnev_t *)
3252                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3253                                    action, arg, sizeof(*dev));
3254         if (dev == NULL) {
3255                 UNLOCK(&sock->lock);
3256                 return (ISC_R_NOMEMORY);
3257         }
3258         ISC_LINK_INIT(dev, ev_link);
3259
3260         ret = allocate_socket(manager, sock->type, &nsock);
3261         if (ret != ISC_R_SUCCESS) {
3262                 isc_event_free(ISC_EVENT_PTR(&dev));
3263                 UNLOCK(&sock->lock);
3264                 return (ret);
3265         }
3266
3267         /*
3268          * Attach to socket and to task.
3269          */
3270         isc_task_attach(task, &ntask);
3271         nsock->references++;
3272
3273         dev->ev_sender = ntask;
3274         dev->newsocket = nsock;
3275
3276         /*
3277          * Poke watcher here.  We still have the socket locked, so there
3278          * is no race condition.  We will keep the lock for such a short
3279          * bit of time waking it up now or later won't matter all that much.
3280          */
3281         if (ISC_LIST_EMPTY(sock->accept_list))
3282                 do_poke = ISC_TRUE;
3283
3284         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3285
3286         if (do_poke)
3287                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3288
3289         UNLOCK(&sock->lock);
3290         return (ISC_R_SUCCESS);
3291 }
3292
3293 isc_result_t
3294 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3295                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3296 {
3297         isc_socket_connev_t *dev;
3298         isc_task_t *ntask = NULL;
3299         isc_socketmgr_t *manager;
3300         int cc;
3301         char strbuf[ISC_STRERRORSIZE];
3302
3303         REQUIRE(VALID_SOCKET(sock));
3304         REQUIRE(addr != NULL);
3305         REQUIRE(task != NULL);
3306         REQUIRE(action != NULL);
3307
3308         manager = sock->manager;
3309         REQUIRE(VALID_MANAGER(manager));
3310         REQUIRE(addr != NULL);
3311
3312         if (isc_sockaddr_ismulticast(addr))
3313                 return (ISC_R_MULTICAST);
3314
3315         LOCK(&sock->lock);
3316
3317         REQUIRE(!sock->connecting);
3318
3319         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3320                                                         ISC_SOCKEVENT_CONNECT,
3321                                                         action, arg,
3322                                                         sizeof(*dev));
3323         if (dev == NULL) {
3324                 UNLOCK(&sock->lock);
3325                 return (ISC_R_NOMEMORY);
3326         }
3327         ISC_LINK_INIT(dev, ev_link);
3328
3329         /*
3330          * Try to do the connect right away, as there can be only one
3331          * outstanding, and it might happen to complete.
3332          */
3333         sock->address = *addr;
3334         cc = connect(sock->fd, &addr->type.sa, addr->length);
3335         if (cc < 0) {
3336                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3337                         goto queue;
3338
3339                 switch (errno) {
3340 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3341                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3342                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3343                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3344                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3345                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3346 #ifdef EHOSTDOWN
3347                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3348 #endif
3349                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3350                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3351                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3352                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3353                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3354 #undef ERROR_MATCH
3355                 }
3356
3357                 sock->connected = 0;
3358
3359                 isc__strerror(errno, strbuf, sizeof(strbuf));
3360                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3361
3362                 UNLOCK(&sock->lock);
3363                 isc_event_free(ISC_EVENT_PTR(&dev));
3364                 return (ISC_R_UNEXPECTED);
3365
3366         err_exit:
3367                 sock->connected = 0;
3368                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3369
3370                 UNLOCK(&sock->lock);
3371                 return (ISC_R_SUCCESS);
3372         }
3373
3374         /*
3375          * If connect completed, fire off the done event.
3376          */
3377         if (cc == 0) {
3378                 sock->connected = 1;
3379                 sock->bound = 1;
3380                 dev->result = ISC_R_SUCCESS;
3381                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3382
3383                 UNLOCK(&sock->lock);
3384                 return (ISC_R_SUCCESS);
3385         }
3386
3387  queue:
3388
3389         /*
3390          * Attach to task.
3391          */
3392         isc_task_attach(task, &ntask);
3393
3394         sock->connecting = 1;
3395
3396         dev->ev_sender = ntask;
3397
3398         /*
3399          * Poke watcher here.  We still have the socket locked, so there
3400          * is no race condition.  We will keep the lock for such a short
3401          * bit of time waking it up now or later won't matter all that much.
3402          */
3403         if (sock->connect_ev == NULL)
3404                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3405
3406         sock->connect_ev = dev;
3407
3408         UNLOCK(&sock->lock);
3409         return (ISC_R_SUCCESS);
3410 }
3411
3412 /*
3413  * Called when a socket with a pending connect() finishes.
3414  */
3415 static void
3416 internal_connect(isc_task_t *me, isc_event_t *ev) {
3417         isc_socket_t *sock;
3418         isc_socket_connev_t *dev;
3419         isc_task_t *task;
3420         int cc;
3421         ISC_SOCKADDR_LEN_T optlen;
3422         char strbuf[ISC_STRERRORSIZE];
3423         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3424
3425         UNUSED(me);
3426         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3427
3428         sock = ev->ev_sender;
3429         INSIST(VALID_SOCKET(sock));
3430
3431         LOCK(&sock->lock);
3432
3433         /*
3434          * When the internal event was sent the reference count was bumped
3435          * to keep the socket around for us.  Decrement the count here.
3436          */
3437         INSIST(sock->references > 0);
3438         sock->references--;
3439         if (sock->references == 0) {
3440                 UNLOCK(&sock->lock);
3441                 destroy(&sock);
3442                 return;
3443         }
3444
3445         /*
3446          * Has this event been canceled?
3447          */
3448         dev = sock->connect_ev;
3449         if (dev == NULL) {
3450                 INSIST(!sock->connecting);
3451                 UNLOCK(&sock->lock);
3452                 return;
3453         }
3454
3455         INSIST(sock->connecting);
3456         sock->connecting = 0;
3457
3458         /*
3459          * Get any possible error status here.
3460          */
3461         optlen = sizeof(cc);
3462         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3463                        (void *)&cc, (void *)&optlen) < 0)
3464                 cc = errno;
3465         else
3466                 errno = cc;
3467
3468         if (errno != 0) {
3469                 /*
3470                  * If the error is EAGAIN, just re-select on this
3471                  * fd and pretend nothing strange happened.
3472                  */
3473                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3474                         sock->connecting = 1;
3475                         select_poke(sock->manager, sock->fd,
3476                                     SELECT_POKE_CONNECT);
3477                         UNLOCK(&sock->lock);
3478
3479                         return;
3480                 }
3481
3482                 /*
3483                  * Translate other errors into ISC_R_* flavors.
3484                  */
3485                 switch (errno) {
3486 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3487                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3488                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3489                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3490                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3491                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3492 #ifdef EHOSTDOWN
3493                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3494 #endif
3495                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3496                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3497                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3498                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3499                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3500                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3501 #undef ERROR_MATCH
3502                 default:
3503                         dev->result = ISC_R_UNEXPECTED;
3504                         isc_sockaddr_format(&sock->address, peerbuf,
3505                                             sizeof(peerbuf));
3506                         isc__strerror(errno, strbuf, sizeof(strbuf));
3507                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3508                                          "internal_connect: connect(%s) %s",
3509                                          peerbuf, strbuf);
3510                 }
3511         } else {
3512                 dev->result = ISC_R_SUCCESS;
3513                 sock->connected = 1;
3514                 sock->bound = 1;
3515         }
3516
3517         sock->connect_ev = NULL;
3518
3519         UNLOCK(&sock->lock);
3520
3521         task = dev->ev_sender;
3522         dev->ev_sender = sock;
3523         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3524 }
3525
3526 isc_result_t
3527 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3528         isc_result_t ret;
3529
3530         REQUIRE(VALID_SOCKET(sock));
3531         REQUIRE(addressp != NULL);
3532
3533         LOCK(&sock->lock);
3534
3535         if (sock->connected) {
3536                 *addressp = sock->address;
3537                 ret = ISC_R_SUCCESS;
3538         } else {
3539                 ret = ISC_R_NOTCONNECTED;
3540         }
3541
3542         UNLOCK(&sock->lock);
3543
3544         return (ret);
3545 }
3546
3547 isc_result_t
3548 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3549         ISC_SOCKADDR_LEN_T len;
3550         isc_result_t ret;
3551         char strbuf[ISC_STRERRORSIZE];
3552
3553         REQUIRE(VALID_SOCKET(sock));
3554         REQUIRE(addressp != NULL);
3555
3556         LOCK(&sock->lock);
3557
3558         if (!sock->bound) {
3559                 ret = ISC_R_NOTBOUND;
3560                 goto out;
3561         }
3562
3563         ret = ISC_R_SUCCESS;
3564
3565         len = sizeof(addressp->type);
3566         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3567                 isc__strerror(errno, strbuf, sizeof(strbuf));
3568                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3569                                  strbuf);
3570                 ret = ISC_R_UNEXPECTED;
3571                 goto out;
3572         }
3573         addressp->length = (unsigned int)len;
3574
3575  out:
3576         UNLOCK(&sock->lock);
3577
3578         return (ret);
3579 }
3580
3581 /*
3582  * Run through the list of events on this socket, and cancel the ones
3583  * queued for task "task" of type "how".  "how" is a bitmask.
3584  */
3585 void
3586 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3587
3588         REQUIRE(VALID_SOCKET(sock));
3589
3590         /*
3591          * Quick exit if there is nothing to do.  Don't even bother locking
3592          * in this case.
3593          */
3594         if (how == 0)
3595                 return;
3596
3597         LOCK(&sock->lock);
3598
3599         /*
3600          * All of these do the same thing, more or less.
3601          * Each will:
3602          *      o If the internal event is marked as "posted" try to
3603          *        remove it from the task's queue.  If this fails, mark it
3604          *        as canceled instead, and let the task clean it up later.
3605          *      o For each I/O request for that task of that type, post
3606          *        its done event with status of "ISC_R_CANCELED".
3607          *      o Reset any state needed.
3608          */
3609         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3610             && !ISC_LIST_EMPTY(sock->recv_list)) {
3611                 isc_socketevent_t      *dev;
3612                 isc_socketevent_t      *next;
3613                 isc_task_t             *current_task;
3614
3615                 dev = ISC_LIST_HEAD(sock->recv_list);
3616
3617                 while (dev != NULL) {
3618                         current_task = dev->ev_sender;
3619                         next = ISC_LIST_NEXT(dev, ev_link);
3620
3621                         if ((task == NULL) || (task == current_task)) {
3622                                 dev->result = ISC_R_CANCELED;
3623                                 send_recvdone_event(sock, &dev);
3624                         }
3625                         dev = next;
3626                 }
3627         }
3628
3629         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3630             && !ISC_LIST_EMPTY(sock->send_list)) {
3631                 isc_socketevent_t      *dev;
3632                 isc_socketevent_t      *next;
3633                 isc_task_t             *current_task;
3634
3635                 dev = ISC_LIST_HEAD(sock->send_list);
3636
3637                 while (dev != NULL) {
3638                         current_task = dev->ev_sender;
3639                         next = ISC_LIST_NEXT(dev, ev_link);
3640
3641                         if ((task == NULL) || (task == current_task)) {
3642                                 dev->result = ISC_R_CANCELED;
3643                                 send_senddone_event(sock, &dev);
3644                         }
3645                         dev = next;
3646                 }
3647         }
3648
3649         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3650             && !ISC_LIST_EMPTY(sock->accept_list)) {
3651                 isc_socket_newconnev_t *dev;
3652                 isc_socket_newconnev_t *next;
3653                 isc_task_t             *current_task;
3654
3655                 dev = ISC_LIST_HEAD(sock->accept_list);
3656                 while (dev != NULL) {
3657                         current_task = dev->ev_sender;
3658                         next = ISC_LIST_NEXT(dev, ev_link);
3659
3660                         if ((task == NULL) || (task == current_task)) {
3661
3662                                 ISC_LIST_UNLINK(sock->accept_list, dev,
3663                                                 ev_link);
3664
3665                                 dev->newsocket->references--;
3666                                 free_socket(&dev->newsocket);
3667
3668                                 dev->result = ISC_R_CANCELED;
3669                                 dev->ev_sender = sock;
3670                                 isc_task_sendanddetach(&current_task,
3671                                                        ISC_EVENT_PTR(&dev));
3672                         }
3673
3674                         dev = next;
3675                 }
3676         }
3677
3678         /*
3679          * Connecting is not a list.
3680          */
3681         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3682             && sock->connect_ev != NULL) {
3683                 isc_socket_connev_t    *dev;
3684                 isc_task_t             *current_task;
3685
3686                 INSIST(sock->connecting);
3687                 sock->connecting = 0;
3688
3689                 dev = sock->connect_ev;
3690                 current_task = dev->ev_sender;
3691
3692                 if ((task == NULL) || (task == current_task)) {
3693                         sock->connect_ev = NULL;
3694
3695                         dev->result = ISC_R_CANCELED;
3696                         dev->ev_sender = sock;
3697                         isc_task_sendanddetach(&current_task,
3698                                                ISC_EVENT_PTR(&dev));
3699                 }
3700         }
3701
3702         UNLOCK(&sock->lock);
3703 }
3704
3705 isc_sockettype_t
3706 isc_socket_gettype(isc_socket_t *sock) {
3707         REQUIRE(VALID_SOCKET(sock));
3708
3709         return (sock->type);
3710 }
3711
3712 isc_boolean_t
3713 isc_socket_isbound(isc_socket_t *sock) {
3714         isc_boolean_t val;
3715
3716         LOCK(&sock->lock);
3717         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3718         UNLOCK(&sock->lock);
3719
3720         return (val);
3721 }
3722
3723 void
3724 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3725 #if defined(IPV6_V6ONLY)
3726         int onoff = yes ? 1 : 0;
3727 #else
3728         UNUSED(yes);
3729         UNUSED(sock);
3730 #endif
3731
3732         REQUIRE(VALID_SOCKET(sock));
3733
3734 #ifdef IPV6_V6ONLY
3735         if (sock->pf == AF_INET6) {
3736                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3737                                  (void *)&onoff, sizeof(onoff));
3738         }
3739 #endif
3740 }
3741
3742 #ifndef ISC_PLATFORM_USETHREADS
3743 void
3744 isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd) {
3745         if (socketmgr == NULL)
3746                 *maxfd = 0;
3747         else {
3748                 /* Prepare duplicates of fd_sets, as select() will modify */
3749                 memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
3750                        socketmgr->fd_bufsize);
3751                 memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
3752                        socketmgr->fd_bufsize);
3753                 *readset = socketmgr->read_fds_copy;
3754                 *writeset = socketmgr->write_fds_copy;
3755                 *maxfd = socketmgr->maxfd + 1;
3756         }
3757 }
3758
3759 isc_result_t
3760 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3761         isc_socketmgr_t *manager = socketmgr;
3762
3763         if (manager == NULL)
3764                 return (ISC_R_NOTFOUND);
3765
3766         process_fds(manager, maxfd, readset, writeset);
3767         return (ISC_R_SUCCESS);
3768 }
3769 #endif /* ISC_PLATFORM_USETHREADS */