]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bind9/lib/isc/unix/socket.c
Vendor import of BIND 9.4.1
[FreeBSD/FreeBSD.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2006  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.237.18.24 2006/06/06 00:56:09 marka Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #ifdef ISC_PLATFORM_HAVESYSUNH
29 #include <sys/un.h>
30 #endif
31 #include <sys/time.h>
32 #include <sys/uio.h>
33
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40
41 #include <isc/buffer.h>
42 #include <isc/bufferlist.h>
43 #include <isc/condition.h>
44 #include <isc/formatcheck.h>
45 #include <isc/list.h>
46 #include <isc/log.h>
47 #include <isc/mem.h>
48 #include <isc/msgs.h>
49 #include <isc/mutex.h>
50 #include <isc/net.h>
51 #include <isc/platform.h>
52 #include <isc/print.h>
53 #include <isc/region.h>
54 #include <isc/socket.h>
55 #include <isc/strerror.h>
56 #include <isc/task.h>
57 #include <isc/thread.h>
58 #include <isc/util.h>
59
60 #include "errno2result.h"
61
62 #ifndef ISC_PLATFORM_USETHREADS
63 #include "socket_p.h"
64 #endif /* ISC_PLATFORM_USETHREADS */
65
66 /*%
67  * Some systems define the socket length argument as an int, some as size_t,
68  * some as socklen_t.  This is here so it can be easily changed if needed.
69  */
70 #ifndef ISC_SOCKADDR_LEN_T
71 #define ISC_SOCKADDR_LEN_T unsigned int
72 #endif
73
/*%
 * "Soft" errors: errno values that signal a transient, non-fatal condition
 * from network related calls such as recv().
 *
 * Zero is accepted on purpose.  BSDI (and perhaps other systems) will
 * sometimes return <0 from recv() while leaving errno == 0.  That is
 * broken, but it has to be worked around here.
 */
#define SOFT_ERROR(e) \
        ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
86
87 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
88
89 /*!<
90  * DLVL(90)  --  Function entry/exit and other tracing.
91  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
92  * DLVL(60)  --  Socket data send/receive
93  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
94  * DLVL(20)  --  Socket creation/destruction.
95  */
96 #define TRACE_LEVEL             90
97 #define CORRECTNESS_LEVEL       70
98 #define IOEVENT_LEVEL           60
99 #define EVENT_LEVEL             50
100 #define CREATION_LEVEL          20
101
102 #define TRACE           DLVL(TRACE_LEVEL)
103 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
104 #define IOEVENT         DLVL(IOEVENT_LEVEL)
105 #define EVENT           DLVL(EVENT_LEVEL)
106 #define CREATION        DLVL(CREATION_LEVEL)
107
108 typedef isc_event_t intev_t;
109
110 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
111 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
112
113 /*!
114  * IPv6 control information.  If the socket is an IPv6 socket we want
115  * to collect the destination address and interface so the client can
116  * set them on outgoing packets.
117  */
118 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
119 #ifndef USE_CMSG
120 #define USE_CMSG        1
121 #endif
122 #endif
123
124 /*%
125  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
126  * a setsockopt() like interface to request timestamps, and if the OS
127  * doesn't do it for us, call gettimeofday() on every UDP receive?
128  */
129 #ifdef SO_TIMESTAMP
130 #ifndef USE_CMSG
131 #define USE_CMSG        1
132 #endif
133 #endif
134
135 /*%
 * The size to raise the receive buffer to (from BIND 8).
137  */
138 #define RCVBUFSIZE (32*1024)
139
140 /*%
141  * The number of times a send operation is repeated if the result is EINTR.
142  */
143 #define NRETRIES 10
144
145 struct isc_socket {
146         /* Not locked. */
147         unsigned int            magic;
148         isc_socketmgr_t        *manager;
149         isc_mutex_t             lock;
150         isc_sockettype_t        type;
151
152         /* Locked by socket lock. */
153         ISC_LINK(isc_socket_t)  link;
154         unsigned int            references;
155         int                     fd;
156         int                     pf;
157
158         ISC_LIST(isc_socketevent_t)             send_list;
159         ISC_LIST(isc_socketevent_t)             recv_list;
160         ISC_LIST(isc_socket_newconnev_t)        accept_list;
161         isc_socket_connev_t                    *connect_ev;
162
163         /*
164          * Internal events.  Posted when a descriptor is readable or
165          * writable.  These are statically allocated and never freed.
166          * They will be set to non-purgable before use.
167          */
168         intev_t                 readable_ev;
169         intev_t                 writable_ev;
170
171         isc_sockaddr_t          address;  /* remote address */
172
173         unsigned int            pending_recv : 1,
174                                 pending_send : 1,
175                                 pending_accept : 1,
176                                 listener : 1, /* listener socket */
177                                 connected : 1,
178                                 connecting : 1, /* connect pending */
179                                 bound : 1; /* bound to local addr */
180
181 #ifdef ISC_NET_RECVOVERFLOW
182         unsigned char           overflow; /* used for MSG_TRUNC fake */
183 #endif
184
185         char                    *recvcmsgbuf;
186         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
187         char                    *sendcmsgbuf;
188         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
189 };
190
191 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
192 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
193
/*%
 * State kept for a socket manager: the list of sockets it owns plus the
 * per-descriptor bookkeeping used when watching descriptors for I/O.
 */
struct isc_socketmgr {
        /* Not locked. */
        unsigned int            magic;  /* checked by VALID_MANAGER() */
        isc_mem_t              *mctx;
        isc_mutex_t             lock;
        /* Locked by manager lock. */
        ISC_LIST(isc_socket_t)  socklist;
        fd_set                  read_fds;       /* fds watched for reading */
        fd_set                  write_fds;      /* fds watched for writing */
        isc_socket_t           *fds[FD_SETSIZE];        /* indexed by fd */
        int                     fdstate[FD_SETSIZE];    /* CLOSED, MANAGED or
                                                         * CLOSE_PENDING,
                                                         * indexed by fd */
        int                     maxfd;
#ifdef ISC_PLATFORM_USETHREADS
        isc_thread_t            watcher;
        isc_condition_t         shutdown_ok;
        int                     pipe_fds[2];    /* [0] is read by
                                                 * select_readmsg(), [1] is
                                                 * written by select_poke() */
#else /* ISC_PLATFORM_USETHREADS */
        unsigned int            refs;
#endif /* ISC_PLATFORM_USETHREADS */
};
214
215 #ifndef ISC_PLATFORM_USETHREADS
216 static isc_socketmgr_t *socketmgr = NULL;
217 #endif /* ISC_PLATFORM_USETHREADS */
218
219 #define CLOSED          0       /* this one must be zero */
220 #define MANAGED         1
221 #define CLOSE_PENDING   2
222
223 /*
224  * send() and recv() iovec counts
225  */
226 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
227 #ifdef ISC_NET_RECVOVERFLOW
228 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
229 #else
230 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
231 #endif
232
233 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
234 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
235 static void free_socket(isc_socket_t **);
236 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
237                                     isc_socket_t **);
238 static void destroy(isc_socket_t **);
239 static void internal_accept(isc_task_t *, isc_event_t *);
240 static void internal_connect(isc_task_t *, isc_event_t *);
241 static void internal_recv(isc_task_t *, isc_event_t *);
242 static void internal_send(isc_task_t *, isc_event_t *);
243 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
244 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
245                               struct msghdr *, struct iovec *, size_t *);
246 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
247                               struct msghdr *, struct iovec *, size_t *);
248
249 #define SELECT_POKE_SHUTDOWN            (-1)
250 #define SELECT_POKE_NOTHING             (-2)
251 #define SELECT_POKE_READ                (-3)
252 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
253 #define SELECT_POKE_WRITE               (-4)
254 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
255 #define SELECT_POKE_CLOSE               (-5)
256
257 #define SOCK_DEAD(s)                    ((s)->references == 0)
258
259 static void
260 manager_log(isc_socketmgr_t *sockmgr,
261             isc_logcategory_t *category, isc_logmodule_t *module, int level,
262             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
263 static void
264 manager_log(isc_socketmgr_t *sockmgr,
265             isc_logcategory_t *category, isc_logmodule_t *module, int level,
266             const char *fmt, ...)
267 {
268         char msgbuf[2048];
269         va_list ap;
270
271         if (! isc_log_wouldlog(isc_lctx, level))
272                 return;
273
274         va_start(ap, fmt);
275         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
276         va_end(ap);
277
278         isc_log_write(isc_lctx, category, module, level,
279                       "sockmgr %p: %s", sockmgr, msgbuf);
280 }
281
282 static void
283 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
284            isc_logcategory_t *category, isc_logmodule_t *module, int level,
285            isc_msgcat_t *msgcat, int msgset, int message,
286            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
287 static void
288 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
289            isc_logcategory_t *category, isc_logmodule_t *module, int level,
290            isc_msgcat_t *msgcat, int msgset, int message,
291            const char *fmt, ...)
292 {
293         char msgbuf[2048];
294         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
295         va_list ap;
296
297         if (! isc_log_wouldlog(isc_lctx, level))
298                 return;
299
300         va_start(ap, fmt);
301         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
302         va_end(ap);
303
304         if (address == NULL) {
305                 isc_log_iwrite(isc_lctx, category, module, level,
306                                msgcat, msgset, message,
307                                "socket %p: %s", sock, msgbuf);
308         } else {
309                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
310                 isc_log_iwrite(isc_lctx, category, module, level,
311                                msgcat, msgset, message,
312                                "socket %p %s: %s", sock, peerbuf, msgbuf);
313         }
314 }
315
316 static void
317 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
318         isc_socket_t *sock;
319
320         /*
321          * This is a wakeup on a socket.  If the socket is not in the
322          * process of being closed, start watching it for either reads
323          * or writes.
324          */
325
326         INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
327
328         if (manager->fdstate[fd] == CLOSE_PENDING) {
329                 manager->fdstate[fd] = CLOSED;
330                 FD_CLR(fd, &manager->read_fds);
331                 FD_CLR(fd, &manager->write_fds);
332                 (void)close(fd);
333                 return;
334         }
335         if (manager->fdstate[fd] != MANAGED)
336                 return;
337
338         sock = manager->fds[fd];
339
340         /*
341          * Set requested bit.
342          */
343         if (msg == SELECT_POKE_READ)
344                 FD_SET(sock->fd, &manager->read_fds);
345         if (msg == SELECT_POKE_WRITE)
346                 FD_SET(sock->fd, &manager->write_fds);
347 }
348
349 #ifdef ISC_PLATFORM_USETHREADS
350 /*
351  * Poke the select loop when there is something for us to do.
352  * The write is required (by POSIX) to complete.  That is, we
353  * will not get partial writes.
354  */
355 static void
356 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
357         int cc;
358         int buf[2];
359         char strbuf[ISC_STRERRORSIZE];
360
361         buf[0] = fd;
362         buf[1] = msg;
363
364         do {
365                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
366 #ifdef ENOSR
367                 /*
368                  * Treat ENOSR as EAGAIN but loop slowly as it is
369                  * unlikely to clear fast.
370                  */
371                 if (cc < 0 && errno == ENOSR) {
372                         sleep(1);
373                         errno = EAGAIN;
374                 }
375 #endif
376         } while (cc < 0 && SOFT_ERROR(errno));
377
378         if (cc < 0) {
379                 isc__strerror(errno, strbuf, sizeof(strbuf));
380                 FATAL_ERROR(__FILE__, __LINE__,
381                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
382                                            ISC_MSG_WRITEFAILED,
383                                            "write() failed "
384                                            "during watcher poke: %s"),
385                             strbuf);
386         }
387
388         INSIST(cc == sizeof(buf));
389 }
390
391 /*
392  * Read a message on the internal fd.
393  */
394 static void
395 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
396         int buf[2];
397         int cc;
398         char strbuf[ISC_STRERRORSIZE];
399
400         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
401         if (cc < 0) {
402                 *msg = SELECT_POKE_NOTHING;
403                 *fd = -1;       /* Silence compiler. */
404                 if (SOFT_ERROR(errno))
405                         return;
406
407                 isc__strerror(errno, strbuf, sizeof(strbuf));
408                 FATAL_ERROR(__FILE__, __LINE__,
409                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
410                                            ISC_MSG_READFAILED,
411                                            "read() failed "
412                                            "during watcher poke: %s"),
413                             strbuf);
414                 
415                 return;
416         }
417         INSIST(cc == sizeof(buf));
418
419         *fd = buf[0];
420         *msg = buf[1];
421 }
422 #else /* ISC_PLATFORM_USETHREADS */
423 /*
424  * Update the state of the socketmgr when something changes.
425  */
426 static void
427 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
428         if (msg == SELECT_POKE_SHUTDOWN)
429                 return;
430         else if (fd >= 0)
431                 wakeup_socket(manager, fd, msg);
432         return;
433 }
434 #endif /* ISC_PLATFORM_USETHREADS */
435
436 /*
437  * Make a fd non-blocking.
438  */
439 static isc_result_t
440 make_nonblock(int fd) {
441         int ret;
442         int flags;
443         char strbuf[ISC_STRERRORSIZE];
444 #ifdef USE_FIONBIO_IOCTL
445         int on = 1;
446
447         ret = ioctl(fd, FIONBIO, (char *)&on);
448 #else
449         flags = fcntl(fd, F_GETFL, 0);
450         flags |= PORT_NONBLOCK;
451         ret = fcntl(fd, F_SETFL, flags);
452 #endif
453
454         if (ret == -1) {
455                 isc__strerror(errno, strbuf, sizeof(strbuf));
456                 UNEXPECTED_ERROR(__FILE__, __LINE__,
457 #ifdef USE_FIONBIO_IOCTL
458                                  "ioctl(%d, FIONBIO, &on): %s", fd,
459 #else
460                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
461 #endif
462                                  strbuf);
463
464                 return (ISC_R_UNEXPECTED);
465         }
466
467         return (ISC_R_SUCCESS);
468 }
469
470 #ifdef USE_CMSG
471 /*
472  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
473  * In order to ensure as much portability as possible, we provide wrapper
474  * functions of these macros.
475  * Note that cmsg_space() could run slow on OSes that do not have
476  * CMSG_SPACE.
477  */
478 static inline ISC_SOCKADDR_LEN_T
479 cmsg_len(ISC_SOCKADDR_LEN_T len) {
480 #ifdef CMSG_LEN
481         return (CMSG_LEN(len));
482 #else
483         ISC_SOCKADDR_LEN_T hdrlen;
484
485         /*
486          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
487          * is correct.
488          */
489         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
490         return (hdrlen + len);
491 #endif
492 }
493
494 static inline ISC_SOCKADDR_LEN_T
495 cmsg_space(ISC_SOCKADDR_LEN_T len) {
496 #ifdef CMSG_SPACE
497         return (CMSG_SPACE(len));
498 #else
499         struct msghdr msg;
500         struct cmsghdr *cmsgp;
501         /*
502          * XXX: The buffer length is an ad-hoc value, but should be enough
503          * in a practical sense.
504          */
505         char dummybuf[sizeof(struct cmsghdr) + 1024];
506
507         memset(&msg, 0, sizeof(msg));
508         msg.msg_control = dummybuf;
509         msg.msg_controllen = sizeof(dummybuf);
510
511         cmsgp = (struct cmsghdr *)dummybuf;
512         cmsgp->cmsg_len = cmsg_len(len);
513
514         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
515         if (cmsgp != NULL)
516                 return ((char *)cmsgp - (char *)msg.msg_control);
517         else
518                 return (0);
519 #endif  
520 }
521 #endif /* USE_CMSG */
522
523 /*
524  * Process control messages received on a socket.
525  */
526 static void
527 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
528 #ifdef USE_CMSG
529         struct cmsghdr *cmsgp;
530 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
531         struct in6_pktinfo *pktinfop;
532 #endif
533 #ifdef SO_TIMESTAMP
534         struct timeval *timevalp;
535 #endif
536 #endif
537
538         /*
539          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
540          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
541          * They are all here, outside of the CPP tests, because it is
542          * more consistent with the usual ISC coding style.
543          */
544         UNUSED(sock);
545         UNUSED(msg);
546         UNUSED(dev);
547
548 #ifdef ISC_NET_BSD44MSGHDR
549
550 #ifdef MSG_TRUNC
551         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
552                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
553 #endif
554
555 #ifdef MSG_CTRUNC
556         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
557                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
558 #endif
559
560 #ifndef USE_CMSG
561         return;
562 #else
563         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
564                 return;
565
566 #ifdef SO_TIMESTAMP
567         timevalp = NULL;
568 #endif
569 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
570         pktinfop = NULL;
571 #endif
572
573         cmsgp = CMSG_FIRSTHDR(msg);
574         while (cmsgp != NULL) {
575                 socket_log(sock, NULL, TRACE,
576                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
577                            "processing cmsg %p", cmsgp);
578
579 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
580                 if (cmsgp->cmsg_level == IPPROTO_IPV6
581                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
582
583                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
584                         memcpy(&dev->pktinfo, pktinfop,
585                                sizeof(struct in6_pktinfo));
586                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
587                         socket_log(sock, NULL, TRACE,
588                                    isc_msgcat, ISC_MSGSET_SOCKET,
589                                    ISC_MSG_IFRECEIVED,
590                                    "interface received on ifindex %u",
591                                    dev->pktinfo.ipi6_ifindex);
592                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
593                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;                         
594                         goto next;
595                 }
596 #endif
597
598 #ifdef SO_TIMESTAMP
599                 if (cmsgp->cmsg_level == SOL_SOCKET
600                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
601                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
602                         dev->timestamp.seconds = timevalp->tv_sec;
603                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
604                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
605                         goto next;
606                 }
607 #endif
608
609         next:
610                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
611         }
612 #endif /* USE_CMSG */
613
614 #endif /* ISC_NET_BSD44MSGHDR */
615 }
616
617 /*
618  * Construct an iov array and attach it to the msghdr passed in.  This is
619  * the SEND constructor, which will use the used region of the buffer
620  * (if using a buffer list) or will use the internal region (if a single
621  * buffer I/O is requested).
622  *
623  * Nothing can be NULL, and the done event must list at least one buffer
624  * on the buffer linked list for this function to be meaningful.
625  *
626  * If write_countp != NULL, *write_countp will hold the number of bytes
627  * this transaction can send.
628  */
629 static void
630 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
631                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
632 {
633         unsigned int iovcount;
634         isc_buffer_t *buffer;
635         isc_region_t used;
636         size_t write_count;
637         size_t skip_count;
638
639         memset(msg, 0, sizeof(*msg));
640
641         if (sock->type == isc_sockettype_udp) {
642                 msg->msg_name = (void *)&dev->address.type.sa;
643                 msg->msg_namelen = dev->address.length;
644         } else {
645                 msg->msg_name = NULL;
646                 msg->msg_namelen = 0;
647         }
648
649         buffer = ISC_LIST_HEAD(dev->bufferlist);
650         write_count = 0;
651         iovcount = 0;
652
653         /*
654          * Single buffer I/O?  Skip what we've done so far in this region.
655          */
656         if (buffer == NULL) {
657                 write_count = dev->region.length - dev->n;
658                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
659                 iov[0].iov_len = write_count;
660                 iovcount = 1;
661
662                 goto config;
663         }
664
665         /*
666          * Multibuffer I/O.
667          * Skip the data in the buffer list that we have already written.
668          */
669         skip_count = dev->n;
670         while (buffer != NULL) {
671                 REQUIRE(ISC_BUFFER_VALID(buffer));
672                 if (skip_count < isc_buffer_usedlength(buffer))
673                         break;
674                 skip_count -= isc_buffer_usedlength(buffer);
675                 buffer = ISC_LIST_NEXT(buffer, link);
676         }
677
678         while (buffer != NULL) {
679                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
680
681                 isc_buffer_usedregion(buffer, &used);
682
683                 if (used.length > 0) {
684                         iov[iovcount].iov_base = (void *)(used.base
685                                                           + skip_count);
686                         iov[iovcount].iov_len = used.length - skip_count;
687                         write_count += (used.length - skip_count);
688                         skip_count = 0;
689                         iovcount++;
690                 }
691                 buffer = ISC_LIST_NEXT(buffer, link);
692         }
693
694         INSIST(skip_count == 0U);
695
696  config:
697         msg->msg_iov = iov;
698         msg->msg_iovlen = iovcount;
699
700 #ifdef ISC_NET_BSD44MSGHDR
701         msg->msg_control = NULL;
702         msg->msg_controllen = 0;
703         msg->msg_flags = 0;
704 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
705         if ((sock->type == isc_sockettype_udp)
706             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
707                 struct cmsghdr *cmsgp;
708                 struct in6_pktinfo *pktinfop;
709
710                 socket_log(sock, NULL, TRACE,
711                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
712                            "sendto pktinfo data, ifindex %u",
713                            dev->pktinfo.ipi6_ifindex);
714
715                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
716                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
717                 msg->msg_control = (void *)sock->sendcmsgbuf;
718
719                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
720                 cmsgp->cmsg_level = IPPROTO_IPV6;
721                 cmsgp->cmsg_type = IPV6_PKTINFO;
722                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
723                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
724                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
725         }
726 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
727 #else /* ISC_NET_BSD44MSGHDR */
728         msg->msg_accrights = NULL;
729         msg->msg_accrightslen = 0;
730 #endif /* ISC_NET_BSD44MSGHDR */
731
732         if (write_countp != NULL)
733                 *write_countp = write_count;
734 }
735
736 /*
737  * Construct an iov array and attach it to the msghdr passed in.  This is
738  * the RECV constructor, which will use the avialable region of the buffer
739  * (if using a buffer list) or will use the internal region (if a single
740  * buffer I/O is requested).
741  *
742  * Nothing can be NULL, and the done event must list at least one buffer
743  * on the buffer linked list for this function to be meaningful.
744  *
745  * If read_countp != NULL, *read_countp will hold the number of bytes
746  * this transaction can receive.
747  */
748 static void
749 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
750                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
751 {
752         unsigned int iovcount;
753         isc_buffer_t *buffer;
754         isc_region_t available;
755         size_t read_count;
756
757         memset(msg, 0, sizeof(struct msghdr));
758
759         if (sock->type == isc_sockettype_udp) {
760                 memset(&dev->address, 0, sizeof(dev->address));
761 #ifdef BROKEN_RECVMSG
762                 if (sock->pf == AF_INET) {
763                         msg->msg_name = (void *)&dev->address.type.sin;
764                         msg->msg_namelen = sizeof(dev->address.type.sin6);
765                 } else if (sock->pf == AF_INET6) {
766                         msg->msg_name = (void *)&dev->address.type.sin6;
767                         msg->msg_namelen = sizeof(dev->address.type.sin6);
768 #ifdef ISC_PLATFORM_HAVESYSUNH
769                 } else if (sock->pf == AF_UNIX) {
770                         msg->msg_name = (void *)&dev->address.type.sunix;
771                         msg->msg_namelen = sizeof(dev->address.type.sunix);
772 #endif
773                 } else {
774                         msg->msg_name = (void *)&dev->address.type.sa;
775                         msg->msg_namelen = sizeof(dev->address.type);
776                 }
777 #else
778                 msg->msg_name = (void *)&dev->address.type.sa;
779                 msg->msg_namelen = sizeof(dev->address.type);
780 #endif
781 #ifdef ISC_NET_RECVOVERFLOW
782                 /* If needed, steal one iovec for overflow detection. */
783                 maxiov--;
784 #endif
785         } else { /* TCP */
786                 msg->msg_name = NULL;
787                 msg->msg_namelen = 0;
788                 dev->address = sock->address;
789         }
790
791         buffer = ISC_LIST_HEAD(dev->bufferlist);
792         read_count = 0;
793
794         /*
795          * Single buffer I/O?  Skip what we've done so far in this region.
796          */
797         if (buffer == NULL) {
798                 read_count = dev->region.length - dev->n;
799                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
800                 iov[0].iov_len = read_count;
801                 iovcount = 1;
802
803                 goto config;
804         }
805
806         /*
807          * Multibuffer I/O.
808          * Skip empty buffers.
809          */
810         while (buffer != NULL) {
811                 REQUIRE(ISC_BUFFER_VALID(buffer));
812                 if (isc_buffer_availablelength(buffer) != 0)
813                         break;
814                 buffer = ISC_LIST_NEXT(buffer, link);
815         }
816
817         iovcount = 0;
818         while (buffer != NULL) {
819                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
820
821                 isc_buffer_availableregion(buffer, &available);
822
823                 if (available.length > 0) {
824                         iov[iovcount].iov_base = (void *)(available.base);
825                         iov[iovcount].iov_len = available.length;
826                         read_count += available.length;
827                         iovcount++;
828                 }
829                 buffer = ISC_LIST_NEXT(buffer, link);
830         }
831
832  config:
833
834         /*
835          * If needed, set up to receive that one extra byte.  Note that
836          * we know there is at least one iov left, since we stole it
837          * at the top of this function.
838          */
839 #ifdef ISC_NET_RECVOVERFLOW
840         if (sock->type == isc_sockettype_udp) {
841                 iov[iovcount].iov_base = (void *)(&sock->overflow);
842                 iov[iovcount].iov_len = 1;
843                 iovcount++;
844         }
845 #endif
846
847         msg->msg_iov = iov;
848         msg->msg_iovlen = iovcount;
849
850 #ifdef ISC_NET_BSD44MSGHDR
851         msg->msg_control = NULL;
852         msg->msg_controllen = 0;
853         msg->msg_flags = 0;
854 #if defined(USE_CMSG)
855         if (sock->type == isc_sockettype_udp) {
856                 msg->msg_control = sock->recvcmsgbuf;
857                 msg->msg_controllen = sock->recvcmsgbuflen;
858         }
859 #endif /* USE_CMSG */
860 #else /* ISC_NET_BSD44MSGHDR */
861         msg->msg_accrights = NULL;
862         msg->msg_accrightslen = 0;
863 #endif /* ISC_NET_BSD44MSGHDR */
864
865         if (read_countp != NULL)
866                 *read_countp = read_count;
867 }
868
869 static void
870 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
871                 isc_socketevent_t *dev)
872 {
873         if (sock->type == isc_sockettype_udp) {
874                 if (address != NULL)
875                         dev->address = *address;
876                 else
877                         dev->address = sock->address;
878         } else if (sock->type == isc_sockettype_tcp) {
879                 INSIST(address == NULL);
880                 dev->address = sock->address;
881         }
882 }
883
884 static void
885 destroy_socketevent(isc_event_t *event) {
886         isc_socketevent_t *ev = (isc_socketevent_t *)event;
887
888         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
889
890         (ev->destroy)(event);
891 }
892
893 static isc_socketevent_t *
894 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
895                      isc_taskaction_t action, const void *arg)
896 {
897         isc_socketevent_t *ev;
898
899         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
900                                                      sock, eventtype,
901                                                      action, arg,
902                                                      sizeof(*ev));
903
904         if (ev == NULL)
905                 return (NULL);
906
907         ev->result = ISC_R_UNEXPECTED;
908         ISC_LINK_INIT(ev, ev_link);
909         ISC_LIST_INIT(ev->bufferlist);
910         ev->region.base = NULL;
911         ev->n = 0;
912         ev->offset = 0;
913         ev->attributes = 0;
914         ev->destroy = ev->ev_destroy;
915         ev->ev_destroy = destroy_socketevent;
916
917         return (ev);
918 }
919
920 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the fields of 'msg' to stdout -- the name pointer
 * and length, every iovec entry and, when ISC_NET_BSD44MSGHDR is defined,
 * the control buffer pointer and length.
 *
 * The widths of msg_namelen, msg_iovlen, iov_len and msg_controllen
 * differ between platforms, so every length is cast to unsigned long and
 * every pointer to void * to match its conversion specifier exactly.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
	       (unsigned long)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
937 #endif
938
/*
 * Status codes returned by doio_recv() and doio_send().
 */
#define DOIO_SUCCESS            0       /* i/o ok, event sent */
#define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD               2       /* i/o error, event sent */
#define DOIO_EOF                3       /* EOF, no event sent */
943
944 static int
945 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
946         int cc;
947         struct iovec iov[MAXSCATTERGATHER_RECV];
948         size_t read_count;
949         size_t actual_count;
950         struct msghdr msghdr;
951         isc_buffer_t *buffer;
952         int recv_errno;
953         char strbuf[ISC_STRERRORSIZE];
954
955         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
956
957 #if defined(ISC_SOCKET_DEBUG)
958         dump_msg(&msghdr);
959 #endif
960
961         cc = recvmsg(sock->fd, &msghdr, 0);
962         recv_errno = errno;
963
964 #if defined(ISC_SOCKET_DEBUG)
965         dump_msg(&msghdr);
966 #endif
967
968         if (cc < 0) {
969                 if (SOFT_ERROR(recv_errno))
970                         return (DOIO_SOFT);
971
972                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
973                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
974                         socket_log(sock, NULL, IOEVENT,
975                                    isc_msgcat, ISC_MSGSET_SOCKET,
976                                    ISC_MSG_DOIORECV, 
977                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
978                                    sock->fd, cc, recv_errno, strbuf);
979                 }
980
981 #define SOFT_OR_HARD(_system, _isc) \
982         if (recv_errno == _system) { \
983                 if (sock->connected) { \
984                         dev->result = _isc; \
985                         return (DOIO_HARD); \
986                 } \
987                 return (DOIO_SOFT); \
988         }
989 #define ALWAYS_HARD(_system, _isc) \
990         if (recv_errno == _system) { \
991                 dev->result = _isc; \
992                 return (DOIO_HARD); \
993         }
994
995                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
996                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
997                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
998                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
999                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1000                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1001                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1002
1003 #undef SOFT_OR_HARD
1004 #undef ALWAYS_HARD
1005
1006                 dev->result = isc__errno2result(recv_errno);
1007                 return (DOIO_HARD);
1008         }
1009
1010         /*
1011          * On TCP, zero length reads indicate EOF, while on
1012          * UDP, zero length reads are perfectly valid, although
1013          * strange.
1014          */
1015         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1016                 return (DOIO_EOF);
1017
1018         if (sock->type == isc_sockettype_udp) {
1019                 dev->address.length = msghdr.msg_namelen;
1020                 if (isc_sockaddr_getport(&dev->address) == 0) {
1021                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1022                                 socket_log(sock, &dev->address, IOEVENT,
1023                                            isc_msgcat, ISC_MSGSET_SOCKET,
1024                                            ISC_MSG_ZEROPORT, 
1025                                            "dropping source port zero packet");
1026                         }
1027                         return (DOIO_SOFT);
1028                 }
1029         }
1030
1031         socket_log(sock, &dev->address, IOEVENT,
1032                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1033                    "packet received correctly");
1034
1035         /*
1036          * Overflow bit detection.  If we received MORE bytes than we should,
1037          * this indicates an overflow situation.  Set the flag in the
1038          * dev entry and adjust how much we read by one.
1039          */
1040 #ifdef ISC_NET_RECVOVERFLOW
1041         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1042                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1043                 cc--;
1044         }
1045 #endif
1046
1047         /*
1048          * If there are control messages attached, run through them and pull
1049          * out the interesting bits.
1050          */
1051         if (sock->type == isc_sockettype_udp)
1052                 process_cmsg(sock, &msghdr, dev);
1053
1054         /*
1055          * update the buffers (if any) and the i/o count
1056          */
1057         dev->n += cc;
1058         actual_count = cc;
1059         buffer = ISC_LIST_HEAD(dev->bufferlist);
1060         while (buffer != NULL && actual_count > 0U) {
1061                 REQUIRE(ISC_BUFFER_VALID(buffer));
1062                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1063                         actual_count -= isc_buffer_availablelength(buffer);
1064                         isc_buffer_add(buffer,
1065                                        isc_buffer_availablelength(buffer));
1066                 } else {
1067                         isc_buffer_add(buffer, actual_count);
1068                         actual_count = 0;
1069                         break;
1070                 }
1071                 buffer = ISC_LIST_NEXT(buffer, link);
1072                 if (buffer == NULL) {
1073                         INSIST(actual_count == 0U);
1074                 }
1075         }
1076
1077         /*
1078          * If we read less than we expected, update counters,
1079          * and let the upper layer poke the descriptor.
1080          */
1081         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1082                 return (DOIO_SOFT);
1083
1084         /*
1085          * Full reads are posted, or partials if partials are ok.
1086          */
1087         dev->result = ISC_R_SUCCESS;
1088         return (DOIO_SUCCESS);
1089 }
1090
1091 /*
1092  * Returns:
1093  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1094  *                      ISC_R_SUCCESS.
1095  *
1096  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1097  *                      dev->result contains the appropriate error.
1098  *
1099  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1100  *                      event was sent.  The operation should be retried.
1101  *
1102  *      No other return values are possible.
1103  */
1104 static int
1105 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1106         int cc;
1107         struct iovec iov[MAXSCATTERGATHER_SEND];
1108         size_t write_count;
1109         struct msghdr msghdr;
1110         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1111         int attempts = 0;
1112         int send_errno;
1113         char strbuf[ISC_STRERRORSIZE];
1114
1115         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1116
1117  resend:
1118         cc = sendmsg(sock->fd, &msghdr, 0);
1119         send_errno = errno;
1120
1121         /*
1122          * Check for error or block condition.
1123          */
1124         if (cc < 0) {
1125                 if (send_errno == EINTR && ++attempts < NRETRIES)
1126                         goto resend;
1127
1128                 if (SOFT_ERROR(send_errno))
1129                         return (DOIO_SOFT);
1130
1131 #define SOFT_OR_HARD(_system, _isc) \
1132         if (send_errno == _system) { \
1133                 if (sock->connected) { \
1134                         dev->result = _isc; \
1135                         return (DOIO_HARD); \
1136                 } \
1137                 return (DOIO_SOFT); \
1138         }
1139 #define ALWAYS_HARD(_system, _isc) \
1140         if (send_errno == _system) { \
1141                 dev->result = _isc; \
1142                 return (DOIO_HARD); \
1143         }
1144
1145                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1146                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1147                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1148                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1149                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1150 #ifdef EHOSTDOWN
1151                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1152 #endif
1153                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1154                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1155                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1156                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1157                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1158
1159 #undef SOFT_OR_HARD
1160 #undef ALWAYS_HARD
1161
1162                 /*
1163                  * The other error types depend on whether or not the
1164                  * socket is UDP or TCP.  If it is UDP, some errors
1165                  * that we expect to be fatal under TCP are merely
1166                  * annoying, and are really soft errors.
1167                  *
1168                  * However, these soft errors are still returned as
1169                  * a status.
1170                  */
1171                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1172                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1173                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1174                                  addrbuf, strbuf);
1175                 dev->result = isc__errno2result(send_errno);
1176                 return (DOIO_HARD);
1177         }
1178
1179         if (cc == 0)
1180                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1181                                  "internal_send: send() %s 0",
1182                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1183                                                 ISC_MSG_RETURNED, "returned"));
1184
1185         /*
1186          * If we write less than we expected, update counters, poke.
1187          */
1188         dev->n += cc;
1189         if ((size_t)cc != write_count)
1190                 return (DOIO_SOFT);
1191
1192         /*
1193          * Exactly what we wanted to write.  We're done with this
1194          * entry.  Post its completion event.
1195          */
1196         dev->result = ISC_R_SUCCESS;
1197         return (DOIO_SUCCESS);
1198 }
1199
1200 /*
1201  * Kill.
1202  *
1203  * Caller must ensure that the socket is not locked and no external
1204  * references exist.
1205  */
1206 static void
1207 destroy(isc_socket_t **sockp) {
1208         isc_socket_t *sock = *sockp;
1209         isc_socketmgr_t *manager = sock->manager;
1210
1211         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1212                    ISC_MSG_DESTROYING, "destroying");
1213
1214         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1215         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1216         INSIST(ISC_LIST_EMPTY(sock->send_list));
1217         INSIST(sock->connect_ev == NULL);
1218         REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
1219
1220         LOCK(&manager->lock);
1221
1222         /*
1223          * No one has this socket open, so the watcher doesn't have to be
1224          * poked, and the socket doesn't have to be locked.
1225          */
1226         manager->fds[sock->fd] = NULL;
1227         manager->fdstate[sock->fd] = CLOSE_PENDING;
1228         select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1229         ISC_LIST_UNLINK(manager->socklist, sock, link);
1230
1231 #ifdef ISC_PLATFORM_USETHREADS
1232         if (ISC_LIST_EMPTY(manager->socklist))
1233                 SIGNAL(&manager->shutdown_ok);
1234 #endif /* ISC_PLATFORM_USETHREADS */
1235
1236         /*
1237          * XXX should reset manager->maxfd here
1238          */
1239
1240         UNLOCK(&manager->lock);
1241
1242         free_socket(sockp);
1243 }
1244
1245 static isc_result_t
1246 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1247                 isc_socket_t **socketp)
1248 {
1249         isc_socket_t *sock;
1250         isc_result_t result;
1251         ISC_SOCKADDR_LEN_T cmsgbuflen;
1252
1253         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1254
1255         if (sock == NULL)
1256                 return (ISC_R_NOMEMORY);
1257
1258         result = ISC_R_UNEXPECTED;
1259
1260         sock->magic = 0;
1261         sock->references = 0;
1262
1263         sock->manager = manager;
1264         sock->type = type;
1265         sock->fd = -1;
1266
1267         ISC_LINK_INIT(sock, link);
1268
1269         sock->recvcmsgbuf = NULL;
1270         sock->sendcmsgbuf = NULL;
1271
1272         /*
1273          * set up cmsg buffers
1274          */
1275         cmsgbuflen = 0;
1276 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1277         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1278 #endif
1279 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1280         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1281 #endif
1282         sock->recvcmsgbuflen = cmsgbuflen;
1283         if (sock->recvcmsgbuflen != 0U) {
1284                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1285                 if (sock->recvcmsgbuf == NULL)
1286                         goto error;
1287         }
1288
1289         cmsgbuflen = 0;
1290 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1291         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1292 #endif
1293         sock->sendcmsgbuflen = cmsgbuflen;
1294         if (sock->sendcmsgbuflen != 0U) {
1295                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1296                 if (sock->sendcmsgbuf == NULL)
1297                         goto error;
1298         }
1299
1300         /*
1301          * set up list of readers and writers to be initially empty
1302          */
1303         ISC_LIST_INIT(sock->recv_list);
1304         ISC_LIST_INIT(sock->send_list);
1305         ISC_LIST_INIT(sock->accept_list);
1306         sock->connect_ev = NULL;
1307         sock->pending_recv = 0;
1308         sock->pending_send = 0;
1309         sock->pending_accept = 0;
1310         sock->listener = 0;
1311         sock->connected = 0;
1312         sock->connecting = 0;
1313         sock->bound = 0;
1314
1315         /*
1316          * initialize the lock
1317          */
1318         result = isc_mutex_init(&sock->lock);
1319         if (result != ISC_R_SUCCESS) {
1320                 sock->magic = 0;
1321                 goto error;
1322         }
1323
1324         /*
1325          * Initialize readable and writable events
1326          */
1327         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1328                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1329                        NULL, sock, sock, NULL, NULL);
1330         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1331                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1332                        NULL, sock, sock, NULL, NULL);
1333
1334         sock->magic = SOCKET_MAGIC;
1335         *socketp = sock;
1336
1337         return (ISC_R_SUCCESS);
1338
1339  error:
1340         if (sock->recvcmsgbuf != NULL)
1341                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1342                             sock->recvcmsgbuflen);
1343         if (sock->sendcmsgbuf != NULL)
1344                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1345                             sock->sendcmsgbuflen);
1346         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1347
1348         return (result);
1349 }
1350
1351 /*
1352  * This event requires that the various lists be empty, that the reference
1353  * count be 1, and that the magic number is valid.  The other socket bits,
1354  * like the lock, must be initialized as well.  The fd associated must be
1355  * marked as closed, by setting it to -1 on close, or this routine will
1356  * also close the socket.
1357  */
1358 static void
1359 free_socket(isc_socket_t **socketp) {
1360         isc_socket_t *sock = *socketp;
1361
1362         INSIST(sock->references == 0);
1363         INSIST(VALID_SOCKET(sock));
1364         INSIST(!sock->connecting);
1365         INSIST(!sock->pending_recv);
1366         INSIST(!sock->pending_send);
1367         INSIST(!sock->pending_accept);
1368         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1369         INSIST(ISC_LIST_EMPTY(sock->send_list));
1370         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1371         INSIST(!ISC_LINK_LINKED(sock, link));
1372
1373         if (sock->recvcmsgbuf != NULL)
1374                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1375                             sock->recvcmsgbuflen);
1376         if (sock->sendcmsgbuf != NULL)
1377                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1378                             sock->sendcmsgbuflen);
1379
1380         sock->magic = 0;
1381
1382         DESTROYLOCK(&sock->lock);
1383
1384         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1385
1386         *socketp = NULL;
1387 }
1388
1389 /*
1390  * Create a new 'type' socket managed by 'manager'.  Events
1391  * will be posted to 'task' and when dispatched 'action' will be
1392  * called with 'arg' as the arg value.  The new socket is returned
1393  * in 'socketp'.
1394  */
1395 isc_result_t
1396 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1397                   isc_socket_t **socketp)
1398 {
1399         isc_socket_t *sock = NULL;
1400         isc_result_t result;
1401 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1402         int on = 1;
1403 #endif
1404 #if defined(SO_RCVBUF)
1405         ISC_SOCKADDR_LEN_T optlen;
1406         int size;
1407 #endif
1408         char strbuf[ISC_STRERRORSIZE];
1409         const char *err = "socket";
1410
1411         REQUIRE(VALID_MANAGER(manager));
1412         REQUIRE(socketp != NULL && *socketp == NULL);
1413
1414         result = allocate_socket(manager, type, &sock);
1415         if (result != ISC_R_SUCCESS)
1416                 return (result);
1417
1418         sock->pf = pf;
1419         switch (type) {
1420         case isc_sockettype_udp:
1421                 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1422                 break;
1423         case isc_sockettype_tcp:
1424                 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1425                 break;
1426         case isc_sockettype_unix:
1427                 sock->fd = socket(pf, SOCK_STREAM, 0);
1428                 break;
1429         }
1430
1431 #ifdef F_DUPFD
1432         /*
1433          * Leave a space for stdio to work in.
1434          */
1435         if (sock->fd >= 0 && sock->fd < 20) {
1436                 int new, tmp;
1437                 new = fcntl(sock->fd, F_DUPFD, 20);
1438                 tmp = errno;
1439                 (void)close(sock->fd);
1440                 errno = tmp;
1441                 sock->fd = new;
1442                 err = "isc_socket_create: fcntl";
1443         }
1444 #endif
1445
1446         if (sock->fd >= (int)FD_SETSIZE) {
1447                 (void)close(sock->fd);
1448                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1449                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1450                                isc_msgcat, ISC_MSGSET_SOCKET,
1451                                ISC_MSG_TOOMANYFDS,
1452                                "%s: too many open file descriptors", "socket");
1453                 free_socket(&sock);
1454                 return (ISC_R_NORESOURCES);
1455         }
1456         
1457         if (sock->fd < 0) {
1458                 free_socket(&sock);
1459
1460                 switch (errno) {
1461                 case EMFILE:
1462                 case ENFILE:
1463                 case ENOBUFS:
1464                         return (ISC_R_NORESOURCES);
1465
1466                 case EPROTONOSUPPORT:
1467                 case EPFNOSUPPORT:
1468                 case EAFNOSUPPORT:
1469                 /*
1470                  * Linux 2.2 (and maybe others) return EINVAL instead of
1471                  * EAFNOSUPPORT.
1472                  */
1473                 case EINVAL:
1474                         return (ISC_R_FAMILYNOSUPPORT);
1475
1476                 default:
1477                         isc__strerror(errno, strbuf, sizeof(strbuf));
1478                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1479                                          "%s() %s: %s", err,
1480                                          isc_msgcat_get(isc_msgcat,
1481                                                         ISC_MSGSET_GENERAL,
1482                                                         ISC_MSG_FAILED,
1483                                                         "failed"),
1484                                          strbuf);
1485                         return (ISC_R_UNEXPECTED);
1486                 }
1487         }
1488
1489         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1490                 (void)close(sock->fd);
1491                 free_socket(&sock);
1492                 return (ISC_R_UNEXPECTED);
1493         }
1494
1495 #ifdef SO_BSDCOMPAT
1496         if (type != isc_sockettype_unix &&
1497             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1498                        (void *)&on, sizeof(on)) < 0) {
1499                 isc__strerror(errno, strbuf, sizeof(strbuf));
1500                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1501                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1502                                  sock->fd,
1503                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1504                                                 ISC_MSG_FAILED, "failed"),
1505                                  strbuf);
1506                 /* Press on... */
1507         }
1508 #endif
1509
1510 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1511         if (type == isc_sockettype_udp) {
1512
1513 #if defined(USE_CMSG)
1514 #if defined(SO_TIMESTAMP)
1515                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1516                                (void *)&on, sizeof(on)) < 0
1517                     && errno != ENOPROTOOPT) {
1518                         isc__strerror(errno, strbuf, sizeof(strbuf));
1519                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1520                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1521                                          sock->fd, 
1522                                          isc_msgcat_get(isc_msgcat,
1523                                                         ISC_MSGSET_GENERAL,
1524                                                         ISC_MSG_FAILED,
1525                                                         "failed"),
1526                                          strbuf);
1527                         /* Press on... */
1528                 }
1529 #endif /* SO_TIMESTAMP */
1530
1531 #if defined(ISC_PLATFORM_HAVEIPV6)
1532                 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1533                         /*
1534                          * Warn explicitly because this anomaly can be hidden
1535                          * in usual operation (and unexpectedly appear later).
1536                          */
1537                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1538                                          "No buffer available to receive "
1539                                          "IPv6 destination");
1540                 }
1541 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1542 #ifdef IPV6_RECVPKTINFO
1543                 /* 2292bis */
1544                 if ((pf == AF_INET6)
1545                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1546                                    (void *)&on, sizeof(on)) < 0)) {
1547                         isc__strerror(errno, strbuf, sizeof(strbuf));
1548                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1549                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1550                                          "%s: %s", sock->fd,
1551                                          isc_msgcat_get(isc_msgcat,
1552                                                         ISC_MSGSET_GENERAL,
1553                                                         ISC_MSG_FAILED,
1554                                                         "failed"),
1555                                          strbuf);
1556                 }
1557 #else
1558                 /* 2292 */
1559                 if ((pf == AF_INET6)
1560                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1561                                    (void *)&on, sizeof(on)) < 0)) {
1562                         isc__strerror(errno, strbuf, sizeof(strbuf));
1563                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1564                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1565                                          sock->fd,
1566                                          isc_msgcat_get(isc_msgcat,
1567                                                         ISC_MSGSET_GENERAL,
1568                                                         ISC_MSG_FAILED,
1569                                                         "failed"),
1570                                          strbuf);
1571                 }
1572 #endif /* IPV6_RECVPKTINFO */
1573 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1574 #ifdef IPV6_USE_MIN_MTU        /*2292bis, not too common yet*/
1575                 /* use minimum MTU */
1576                 if (pf == AF_INET6) {
1577                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1578                                          IPV6_USE_MIN_MTU,
1579                                          (void *)&on, sizeof(on));
1580                 }
1581 #endif
1582 #endif /* ISC_PLATFORM_HAVEIPV6 */
1583 #endif /* defined(USE_CMSG) */
1584
1585 #if defined(SO_RCVBUF)
1586                 optlen = sizeof(size);
1587                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1588                                (void *)&size, &optlen) >= 0 &&
1589                      size < RCVBUFSIZE) {
1590                         size = RCVBUFSIZE;
1591                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1592                                        (void *)&size, sizeof(size)) == -1) {
1593                                 isc__strerror(errno, strbuf, sizeof(strbuf));
1594                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1595                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
1596                                         sock->fd, size,
1597                                         isc_msgcat_get(isc_msgcat,
1598                                                        ISC_MSGSET_GENERAL,
1599                                                        ISC_MSG_FAILED,
1600                                                        "failed"),
1601                                         strbuf);
1602                         }
1603                 }
1604 #endif
1605         }
1606 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1607
1608         sock->references = 1;
1609         *socketp = sock;
1610
1611         LOCK(&manager->lock);
1612
1613         /*
1614          * Note we don't have to lock the socket like we normally would because
1615          * there are no external references to it yet.
1616          */
1617
1618         manager->fds[sock->fd] = sock;
1619         manager->fdstate[sock->fd] = MANAGED;
1620         ISC_LIST_APPEND(manager->socklist, sock, link);
1621         if (manager->maxfd < sock->fd)
1622                 manager->maxfd = sock->fd;
1623
1624         UNLOCK(&manager->lock);
1625
1626         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1627                    ISC_MSG_CREATED, "created");
1628
1629         return (ISC_R_SUCCESS);
1630 }
1631
1632 /*
1633  * Attach to a socket.  Caller must explicitly detach when it is done.
1634  */
1635 void
1636 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1637         REQUIRE(VALID_SOCKET(sock));
1638         REQUIRE(socketp != NULL && *socketp == NULL);
1639
1640         LOCK(&sock->lock);
1641         sock->references++;
1642         UNLOCK(&sock->lock);
1643
1644         *socketp = sock;
1645 }
1646
1647 /*
1648  * Dereference a socket.  If this is the last reference to it, clean things
1649  * up by destroying the socket.
1650  */
1651 void
1652 isc_socket_detach(isc_socket_t **socketp) {
1653         isc_socket_t *sock;
1654         isc_boolean_t kill_socket = ISC_FALSE;
1655
1656         REQUIRE(socketp != NULL);
1657         sock = *socketp;
1658         REQUIRE(VALID_SOCKET(sock));
1659
1660         LOCK(&sock->lock);
1661         REQUIRE(sock->references > 0);
1662         sock->references--;
1663         if (sock->references == 0)
1664                 kill_socket = ISC_TRUE;
1665         UNLOCK(&sock->lock);
1666
1667         if (kill_socket)
1668                 destroy(&sock);
1669
1670         *socketp = NULL;
1671 }
1672
1673 /*
1674  * I/O is possible on a given socket.  Schedule an event to this task that
1675  * will call an internal function to do the I/O.  This will charge the
1676  * task with the I/O operation and let our select loop handler get back
1677  * to doing something real as fast as possible.
1678  *
1679  * The socket and manager must be locked before calling this function.
1680  */
1681 static void
1682 dispatch_recv(isc_socket_t *sock) {
1683         intev_t *iev;
1684         isc_socketevent_t *ev;
1685
1686         INSIST(!sock->pending_recv);
1687
1688         ev = ISC_LIST_HEAD(sock->recv_list);
1689         if (ev == NULL)
1690                 return;
1691
1692         sock->pending_recv = 1;
1693         iev = &sock->readable_ev;
1694
1695         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1696                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
1697
1698         sock->references++;
1699         iev->ev_sender = sock;
1700         iev->ev_action = internal_recv;
1701         iev->ev_arg = sock;
1702
1703         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1704 }
1705
1706 static void
1707 dispatch_send(isc_socket_t *sock) {
1708         intev_t *iev;
1709         isc_socketevent_t *ev;
1710
1711         INSIST(!sock->pending_send);
1712
1713         ev = ISC_LIST_HEAD(sock->send_list);
1714         if (ev == NULL)
1715                 return;
1716
1717         sock->pending_send = 1;
1718         iev = &sock->writable_ev;
1719
1720         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1721                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
1722
1723         sock->references++;
1724         iev->ev_sender = sock;
1725         iev->ev_action = internal_send;
1726         iev->ev_arg = sock;
1727
1728         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1729 }
1730
1731 /*
1732  * Dispatch an internal accept event.
1733  */
1734 static void
1735 dispatch_accept(isc_socket_t *sock) {
1736         intev_t *iev;
1737         isc_socket_newconnev_t *ev;
1738
1739         INSIST(!sock->pending_accept);
1740
1741         /*
1742          * Are there any done events left, or were they all canceled
1743          * before the manager got the socket lock?
1744          */
1745         ev = ISC_LIST_HEAD(sock->accept_list);
1746         if (ev == NULL)
1747                 return;
1748
1749         sock->pending_accept = 1;
1750         iev = &sock->readable_ev;
1751
1752         sock->references++;  /* keep socket around for this internal event */
1753         iev->ev_sender = sock;
1754         iev->ev_action = internal_accept;
1755         iev->ev_arg = sock;
1756
1757         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1758 }
1759
1760 static void
1761 dispatch_connect(isc_socket_t *sock) {
1762         intev_t *iev;
1763         isc_socket_connev_t *ev;
1764
1765         iev = &sock->writable_ev;
1766
1767         ev = sock->connect_ev;
1768         INSIST(ev != NULL); /* XXX */
1769
1770         INSIST(sock->connecting);
1771
1772         sock->references++;  /* keep socket around for this internal event */
1773         iev->ev_sender = sock;
1774         iev->ev_action = internal_connect;
1775         iev->ev_arg = sock;
1776
1777         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1778 }
1779
1780 /*
1781  * Dequeue an item off the given socket's read queue, set the result code
1782  * in the done event to the one provided, and send it to the task it was
1783  * destined for.
1784  *
1785  * If the event to be sent is on a list, remove it before sending.  If
1786  * asked to, send and detach from the socket as well.
1787  *
1788  * Caller must have the socket locked if the event is attached to the socket.
1789  */
1790 static void
1791 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1792         isc_task_t *task;
1793
1794         task = (*dev)->ev_sender;
1795
1796         (*dev)->ev_sender = sock;
1797
1798         if (ISC_LINK_LINKED(*dev, ev_link))
1799                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1800
1801         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1802             == ISC_SOCKEVENTATTR_ATTACHED)
1803                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1804         else
1805                 isc_task_send(task, (isc_event_t **)dev);
1806 }
1807
1808 /*
1809  * See comments for send_recvdone_event() above.
1810  *
1811  * Caller must have the socket locked if the event is attached to the socket.
1812  */
1813 static void
1814 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1815         isc_task_t *task;
1816
1817         INSIST(dev != NULL && *dev != NULL);
1818
1819         task = (*dev)->ev_sender;
1820         (*dev)->ev_sender = sock;
1821
1822         if (ISC_LINK_LINKED(*dev, ev_link))
1823                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1824
1825         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1826             == ISC_SOCKEVENTATTR_ATTACHED)
1827                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1828         else
1829                 isc_task_send(task, (isc_event_t **)dev);
1830 }
1831
1832 /*
1833  * Call accept() on a socket, to get the new file descriptor.  The listen
1834  * socket is used as a prototype to create a new isc_socket_t.  The new
1835  * socket has one outstanding reference.  The task receiving the event
1836  * will be detached from just after the event is delivered.
1837  *
1838  * On entry to this function, the event delivered is the internal
1839  * readable event, and the first item on the accept_list should be
1840  * the done event we want to send.  If the list is empty, this is a no-op,
1841  * so just unlock and return.
1842  */
1843 static void
1844 internal_accept(isc_task_t *me, isc_event_t *ev) {
1845         isc_socket_t *sock;
1846         isc_socketmgr_t *manager;
1847         isc_socket_newconnev_t *dev;
1848         isc_task_t *task;
1849         ISC_SOCKADDR_LEN_T addrlen;
1850         int fd;
1851         isc_result_t result = ISC_R_SUCCESS;
1852         char strbuf[ISC_STRERRORSIZE];
1853         const char *err = "accept";
1854
1855         UNUSED(me);
1856
1857         sock = ev->ev_sender;
1858         INSIST(VALID_SOCKET(sock));
1859
1860         LOCK(&sock->lock);
1861         socket_log(sock, NULL, TRACE,
1862                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1863                    "internal_accept called, locked socket");
1864
1865         manager = sock->manager;
1866         INSIST(VALID_MANAGER(manager));
1867
1868         INSIST(sock->listener);
1869         INSIST(sock->pending_accept == 1);
1870         sock->pending_accept = 0;
1871
1872         INSIST(sock->references > 0);
1873         sock->references--;  /* the internal event is done with this socket */
1874         if (sock->references == 0) {
1875                 UNLOCK(&sock->lock);
1876                 destroy(&sock);
1877                 return;
1878         }
1879
1880         /*
1881          * Get the first item off the accept list.
1882          * If it is empty, unlock the socket and return.
1883          */
1884         dev = ISC_LIST_HEAD(sock->accept_list);
1885         if (dev == NULL) {
1886                 UNLOCK(&sock->lock);
1887                 return;
1888         }
1889
1890         /*
1891          * Try to accept the new connection.  If the accept fails with
1892          * EAGAIN or EINTR, simply poke the watcher to watch this socket
1893          * again.  Also ignore ECONNRESET, which has been reported to
1894          * be spuriously returned on Linux 2.2.19 although it is not
1895          * a documented error for accept().  ECONNABORTED has been
1896          * reported for Solaris 8.  The rest are thrown in not because
1897          * we have seen them but because they are ignored by other
1898          * deamons such as BIND 8 and Apache.
1899          */
1900
1901         addrlen = sizeof(dev->newsocket->address.type);
1902         memset(&dev->newsocket->address.type.sa, 0, addrlen);
1903         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1904                     (void *)&addrlen);
1905
1906 #ifdef F_DUPFD
1907         /*
1908          * Leave a space for stdio to work in.
1909          */
1910         if (fd >= 0 && fd < 20) {
1911                 int new, tmp;
1912                 new = fcntl(fd, F_DUPFD, 20);
1913                 tmp = errno;
1914                 (void)close(fd);
1915                 errno = tmp;
1916                 fd = new;
1917                 err = "fcntl";
1918         }
1919 #endif
1920
1921         if (fd < 0) {
1922                 if (SOFT_ERROR(errno))
1923                         goto soft_error;
1924                 switch (errno) {
1925                 case ENOBUFS:
1926                 case ENFILE:
1927                 case ENOMEM:
1928                 case ECONNRESET:
1929                 case ECONNABORTED:
1930                 case EHOSTUNREACH:
1931                 case EHOSTDOWN:
1932                 case ENETUNREACH:
1933                 case ENETDOWN:
1934                 case ECONNREFUSED:
1935 #ifdef EPROTO
1936                 case EPROTO:
1937 #endif
1938 #ifdef ENONET
1939                 case ENONET:
1940 #endif
1941                         goto soft_error;
1942                 default:
1943                         break;
1944                 }
1945                 isc__strerror(errno, strbuf, sizeof(strbuf));
1946                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1947                                  "internal_accept: %s() %s: %s", err,
1948                                  isc_msgcat_get(isc_msgcat,
1949                                                 ISC_MSGSET_GENERAL,
1950                                                 ISC_MSG_FAILED,
1951                                                 "failed"),
1952                                  strbuf);
1953                 fd = -1;
1954                 result = ISC_R_UNEXPECTED;
1955         } else {
1956                 if (addrlen == 0U) {
1957                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1958                                          "internal_accept(): "
1959                                          "accept() failed to return "
1960                                          "remote address");
1961
1962                         (void)close(fd);
1963                         goto soft_error;
1964                 } else if (dev->newsocket->address.type.sa.sa_family !=
1965                            sock->pf)
1966                 {
1967                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1968                                          "internal_accept(): "
1969                                          "accept() returned peer address "
1970                                          "family %u (expected %u)", 
1971                                          dev->newsocket->address.
1972                                          type.sa.sa_family,
1973                                          sock->pf);
1974                         (void)close(fd);
1975                         goto soft_error;
1976                 } else if (fd >= (int)FD_SETSIZE) {
1977                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1978                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1979                                        isc_msgcat, ISC_MSGSET_SOCKET,
1980                                        ISC_MSG_TOOMANYFDS,
1981                                        "%s: too many open file descriptors",
1982                                        "accept");
1983                         (void)close(fd);
1984                         goto soft_error;
1985                 }
1986         }
1987
1988         if (fd != -1) {
1989                 dev->newsocket->address.length = addrlen;
1990                 dev->newsocket->pf = sock->pf;
1991         }
1992
1993         /*
1994          * Pull off the done event.
1995          */
1996         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
1997
1998         /*
1999          * Poke watcher if there are more pending accepts.
2000          */
2001         if (!ISC_LIST_EMPTY(sock->accept_list))
2002                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2003
2004         UNLOCK(&sock->lock);
2005
2006         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2007                 (void)close(fd);
2008                 fd = -1;
2009                 result = ISC_R_UNEXPECTED;
2010         }
2011
2012         /*
2013          * -1 means the new socket didn't happen.
2014          */
2015         if (fd != -1) {
2016                 LOCK(&manager->lock);
2017                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2018
2019                 dev->newsocket->fd = fd;
2020                 dev->newsocket->bound = 1;
2021                 dev->newsocket->connected = 1;
2022
2023                 /*
2024                  * Save away the remote address
2025                  */
2026                 dev->address = dev->newsocket->address;
2027
2028                 manager->fds[fd] = dev->newsocket;
2029                 manager->fdstate[fd] = MANAGED;
2030                 if (manager->maxfd < fd)
2031                         manager->maxfd = fd;
2032
2033                 socket_log(sock, &dev->newsocket->address, CREATION,
2034                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2035                            "accepted connection, new socket %p",
2036                            dev->newsocket);
2037
2038                 UNLOCK(&manager->lock);
2039         } else {
2040                 dev->newsocket->references--;
2041                 free_socket(&dev->newsocket);
2042         }
2043         
2044         /*
2045          * Fill in the done event details and send it off.
2046          */
2047         dev->result = result;
2048         task = dev->ev_sender;
2049         dev->ev_sender = sock;
2050
2051         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2052         return;
2053
2054  soft_error:
2055         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2056         UNLOCK(&sock->lock);
2057         return;
2058 }
2059
2060 static void
2061 internal_recv(isc_task_t *me, isc_event_t *ev) {
2062         isc_socketevent_t *dev;
2063         isc_socket_t *sock;
2064
2065         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2066
2067         sock = ev->ev_sender;
2068         INSIST(VALID_SOCKET(sock));
2069
2070         LOCK(&sock->lock);
2071         socket_log(sock, NULL, IOEVENT,
2072                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2073                    "internal_recv: task %p got event %p", me, ev);
2074
2075         INSIST(sock->pending_recv == 1);
2076         sock->pending_recv = 0;
2077
2078         INSIST(sock->references > 0);
2079         sock->references--;  /* the internal event is done with this socket */
2080         if (sock->references == 0) {
2081                 UNLOCK(&sock->lock);
2082                 destroy(&sock);
2083                 return;
2084         }
2085
2086         /*
2087          * Try to do as much I/O as possible on this socket.  There are no
2088          * limits here, currently.
2089          */
2090         dev = ISC_LIST_HEAD(sock->recv_list);
2091         while (dev != NULL) {
2092                 switch (doio_recv(sock, dev)) {
2093                 case DOIO_SOFT:
2094                         goto poke;
2095
2096                 case DOIO_EOF:
2097                         /*
2098                          * read of 0 means the remote end was closed.
2099                          * Run through the event queue and dispatch all
2100                          * the events with an EOF result code.
2101                          */
2102                         do {
2103                                 dev->result = ISC_R_EOF;
2104                                 send_recvdone_event(sock, &dev);
2105                                 dev = ISC_LIST_HEAD(sock->recv_list);
2106                         } while (dev != NULL);
2107                         goto poke;
2108
2109                 case DOIO_SUCCESS:
2110                 case DOIO_HARD:
2111                         send_recvdone_event(sock, &dev);
2112                         break;
2113                 }
2114
2115                 dev = ISC_LIST_HEAD(sock->recv_list);
2116         }
2117
2118  poke:
2119         if (!ISC_LIST_EMPTY(sock->recv_list))
2120                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2121
2122         UNLOCK(&sock->lock);
2123 }
2124
2125 static void
2126 internal_send(isc_task_t *me, isc_event_t *ev) {
2127         isc_socketevent_t *dev;
2128         isc_socket_t *sock;
2129
2130         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2131
2132         /*
2133          * Find out what socket this is and lock it.
2134          */
2135         sock = (isc_socket_t *)ev->ev_sender;
2136         INSIST(VALID_SOCKET(sock));
2137
2138         LOCK(&sock->lock);
2139         socket_log(sock, NULL, IOEVENT,
2140                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2141                    "internal_send: task %p got event %p", me, ev);
2142
2143         INSIST(sock->pending_send == 1);
2144         sock->pending_send = 0;
2145
2146         INSIST(sock->references > 0);
2147         sock->references--;  /* the internal event is done with this socket */
2148         if (sock->references == 0) {
2149                 UNLOCK(&sock->lock);
2150                 destroy(&sock);
2151                 return;
2152         }
2153
2154         /*
2155          * Try to do as much I/O as possible on this socket.  There are no
2156          * limits here, currently.
2157          */
2158         dev = ISC_LIST_HEAD(sock->send_list);
2159         while (dev != NULL) {
2160                 switch (doio_send(sock, dev)) {
2161                 case DOIO_SOFT:
2162                         goto poke;
2163
2164                 case DOIO_HARD:
2165                 case DOIO_SUCCESS:
2166                         send_senddone_event(sock, &dev);
2167                         break;
2168                 }
2169
2170                 dev = ISC_LIST_HEAD(sock->send_list);
2171         }
2172
2173  poke:
2174         if (!ISC_LIST_EMPTY(sock->send_list))
2175                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2176
2177         UNLOCK(&sock->lock);
2178 }
2179
2180 static void
2181 process_fds(isc_socketmgr_t *manager, int maxfd,
2182             fd_set *readfds, fd_set *writefds)
2183 {
2184         int i;
2185         isc_socket_t *sock;
2186         isc_boolean_t unlock_sock;
2187
2188         REQUIRE(maxfd <= (int)FD_SETSIZE);
2189
2190         /*
2191          * Process read/writes on other fds here.  Avoid locking
2192          * and unlocking twice if both reads and writes are possible.
2193          */
2194         for (i = 0; i < maxfd; i++) {
2195 #ifdef ISC_PLATFORM_USETHREADS
2196                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2197                         continue;
2198 #endif /* ISC_PLATFORM_USETHREADS */
2199
2200                 if (manager->fdstate[i] == CLOSE_PENDING) {
2201                         manager->fdstate[i] = CLOSED;
2202                         FD_CLR(i, &manager->read_fds);
2203                         FD_CLR(i, &manager->write_fds);
2204
2205                         (void)close(i);
2206
2207                         continue;
2208                 }
2209
2210                 sock = manager->fds[i];
2211                 unlock_sock = ISC_FALSE;
2212                 if (FD_ISSET(i, readfds)) {
2213                         if (sock == NULL) {
2214                                 FD_CLR(i, &manager->read_fds);
2215                                 goto check_write;
2216                         }
2217                         unlock_sock = ISC_TRUE;
2218                         LOCK(&sock->lock);
2219                         if (!SOCK_DEAD(sock)) {
2220                                 if (sock->listener)
2221                                         dispatch_accept(sock);
2222                                 else
2223                                         dispatch_recv(sock);
2224                         }
2225                         FD_CLR(i, &manager->read_fds);
2226                 }
2227         check_write:
2228                 if (FD_ISSET(i, writefds)) {
2229                         if (sock == NULL) {
2230                                 FD_CLR(i, &manager->write_fds);
2231                                 continue;
2232                         }
2233                         if (!unlock_sock) {
2234                                 unlock_sock = ISC_TRUE;
2235                                 LOCK(&sock->lock);
2236                         }
2237                         if (!SOCK_DEAD(sock)) {
2238                                 if (sock->connecting)
2239                                         dispatch_connect(sock);
2240                                 else
2241                                         dispatch_send(sock);
2242                         }
2243                         FD_CLR(i, &manager->write_fds);
2244                 }
2245                 if (unlock_sock)
2246                         UNLOCK(&sock->lock);
2247         }
2248 }
2249
#ifdef ISC_PLATFORM_USETHREADS
/*
 * Body of the watcher thread.  It loops, blocking in select() on the
 * manager's read and write sets, until a shutdown message arrives on the
 * control pipe.
 *
 * After each successful select() it first drains messages from the
 * control pipe (if readable), then hands the result sets to
 * process_fds() so that I/O events are posted to the interested tasks.
 *
 * The manager lock is held except while blocked in select().
 */
static isc_threadresult_t
watcher(void *uap) {
	isc_socketmgr_t *manager = uap;
	isc_boolean_t shutting_down = ISC_FALSE;
	fd_set readfds;
	fd_set writefds;
	char strbuf[ISC_STRERRORSIZE];
	int ctlfd;
	int nready;
	int maxfd;
	int msg, fd;

	/*
	 * The read end of the control pipe never changes, so fetch it once.
	 */
	LOCK(&manager->lock);
	ctlfd = manager->pipe_fds[0];

	while (!shutting_down) {
		/*
		 * Retry select() for as long as it fails; only soft
		 * errors are survivable.
		 */
		do {
			/* Work on copies; select() modifies its sets. */
			readfds = manager->read_fds;
			writefds = manager->write_fds;
			maxfd = manager->maxfd + 1;

			UNLOCK(&manager->lock);

			nready = select(maxfd, &readfds, &writefds,
					NULL, NULL);
			if (nready < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "select() %s: %s",
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"),
					    strbuf);
			}

			LOCK(&manager->lock);
		} while (nready < 0);

		/*
		 * Drain the control pipe.
		 */
		if (FD_ISSET(ctlfd, &readfds)) {
			do {
				select_readmsg(manager, &fd, &msg);

				manager_log(manager, IOEVENT,
					    isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_SOCKET,
						ISC_MSG_WATCHERMSG,
						"watcher got message %d"),
					    msg);

				if (msg == SELECT_POKE_SHUTDOWN) {
					/*
					 * Shutdown requested.  The fds from
					 * this round are still processed
					 * below before the loop exits.
					 */
					shutting_down = ISC_TRUE;
				} else if (msg != SELECT_POKE_NOTHING) {
					/*
					 * A wakeup for a socket: decide
					 * whether it needs to be watched
					 * for reading and/or writing.
					 */
					wakeup_socket(manager, fd, msg);
				}
			} while (msg != SELECT_POKE_NOTHING &&
				 msg != SELECT_POKE_SHUTDOWN);
		}

		process_fds(manager, maxfd, &readfds, &writefds);
	}

	manager_log(manager, TRACE,
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	UNLOCK(&manager->lock);
	return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
2357
2358 /*
2359  * Create a new socket manager.
2360  */
2361 isc_result_t
2362 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2363         isc_socketmgr_t *manager;
2364 #ifdef ISC_PLATFORM_USETHREADS
2365         char strbuf[ISC_STRERRORSIZE];
2366 #endif
2367         isc_result_t result;
2368
2369         REQUIRE(managerp != NULL && *managerp == NULL);
2370
2371 #ifndef ISC_PLATFORM_USETHREADS
2372         if (socketmgr != NULL) {
2373                 socketmgr->refs++;
2374                 *managerp = socketmgr;
2375                 return (ISC_R_SUCCESS);
2376         }
2377 #endif /* ISC_PLATFORM_USETHREADS */
2378
2379         manager = isc_mem_get(mctx, sizeof(*manager));
2380         if (manager == NULL)
2381                 return (ISC_R_NOMEMORY);
2382
2383         manager->magic = SOCKET_MANAGER_MAGIC;
2384         manager->mctx = NULL;
2385         memset(manager->fds, 0, sizeof(manager->fds));
2386         ISC_LIST_INIT(manager->socklist);
2387         result = isc_mutex_init(&manager->lock);
2388         if (result != ISC_R_SUCCESS) {
2389                 isc_mem_put(mctx, manager, sizeof(*manager));
2390                 return (result);
2391         }
2392 #ifdef ISC_PLATFORM_USETHREADS
2393         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2394                 DESTROYLOCK(&manager->lock);
2395                 isc_mem_put(mctx, manager, sizeof(*manager));
2396                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2397                                  "isc_condition_init() %s",
2398                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2399                                                 ISC_MSG_FAILED, "failed"));
2400                 return (ISC_R_UNEXPECTED);
2401         }
2402
2403         /*
2404          * Create the special fds that will be used to wake up the
2405          * select/poll loop when something internal needs to be done.
2406          */
2407         if (pipe(manager->pipe_fds) != 0) {
2408                 DESTROYLOCK(&manager->lock);
2409                 isc_mem_put(mctx, manager, sizeof(*manager));
2410                 isc__strerror(errno, strbuf, sizeof(strbuf));
2411                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2412                                  "pipe() %s: %s",
2413                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2414                                                 ISC_MSG_FAILED, "failed"),
2415                                  strbuf);
2416
2417                 return (ISC_R_UNEXPECTED);
2418         }
2419
2420         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2421 #if 0
2422         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2423 #endif
2424 #else /* ISC_PLATFORM_USETHREADS */
2425         manager->refs = 1;
2426 #endif /* ISC_PLATFORM_USETHREADS */
2427
2428         /*
2429          * Set up initial state for the select loop
2430          */
2431         FD_ZERO(&manager->read_fds);
2432         FD_ZERO(&manager->write_fds);
2433 #ifdef ISC_PLATFORM_USETHREADS
2434         FD_SET(manager->pipe_fds[0], &manager->read_fds);
2435         manager->maxfd = manager->pipe_fds[0];
2436 #else /* ISC_PLATFORM_USETHREADS */
2437         manager->maxfd = 0;
2438 #endif /* ISC_PLATFORM_USETHREADS */
2439         memset(manager->fdstate, 0, sizeof(manager->fdstate));
2440
2441 #ifdef ISC_PLATFORM_USETHREADS
2442         /*
2443          * Start up the select/poll thread.
2444          */
2445         if (isc_thread_create(watcher, manager, &manager->watcher) !=
2446             ISC_R_SUCCESS) {
2447                 (void)close(manager->pipe_fds[0]);
2448                 (void)close(manager->pipe_fds[1]);
2449                 DESTROYLOCK(&manager->lock);
2450                 isc_mem_put(mctx, manager, sizeof(*manager));
2451                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2452                                  "isc_thread_create() %s",
2453                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2454                                                 ISC_MSG_FAILED, "failed"));
2455                 return (ISC_R_UNEXPECTED);
2456         }
2457 #endif /* ISC_PLATFORM_USETHREADS */
2458         isc_mem_attach(mctx, &manager->mctx);
2459
2460 #ifndef ISC_PLATFORM_USETHREADS
2461         socketmgr = manager;
2462 #endif /* ISC_PLATFORM_USETHREADS */
2463         *managerp = manager;
2464
2465         return (ISC_R_SUCCESS);
2466 }
2467
2468 void
2469 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2470         isc_socketmgr_t *manager;
2471         int i;
2472         isc_mem_t *mctx;
2473
2474         /*
2475          * Destroy a socket manager.
2476          */
2477
2478         REQUIRE(managerp != NULL);
2479         manager = *managerp;
2480         REQUIRE(VALID_MANAGER(manager));
2481
2482 #ifndef ISC_PLATFORM_USETHREADS
2483         if (manager->refs > 1) {
2484                 manager->refs--;
2485                 *managerp = NULL;
2486                 return;
2487         }
2488 #endif /* ISC_PLATFORM_USETHREADS */
2489
2490         LOCK(&manager->lock);
2491
2492 #ifdef ISC_PLATFORM_USETHREADS
2493         /*
2494          * Wait for all sockets to be destroyed.
2495          */
2496         while (!ISC_LIST_EMPTY(manager->socklist)) {
2497                 manager_log(manager, CREATION,
2498                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2499                                            ISC_MSG_SOCKETSREMAIN,
2500                                            "sockets exist"));
2501                 WAIT(&manager->shutdown_ok, &manager->lock);
2502         }
2503 #else /* ISC_PLATFORM_USETHREADS */
2504         /*
2505          * Hope all sockets have been destroyed.
2506          */
2507         if (!ISC_LIST_EMPTY(manager->socklist)) {
2508                 manager_log(manager, CREATION,
2509                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2510                                            ISC_MSG_SOCKETSREMAIN,
2511                                            "sockets exist"));
2512                 INSIST(0);
2513         }
2514 #endif /* ISC_PLATFORM_USETHREADS */
2515
2516         UNLOCK(&manager->lock);
2517
2518         /*
2519          * Here, poke our select/poll thread.  Do this by closing the write
2520          * half of the pipe, which will send EOF to the read half.
2521          * This is currently a no-op in the non-threaded case.
2522          */
2523         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2524
2525 #ifdef ISC_PLATFORM_USETHREADS
2526         /*
2527          * Wait for thread to exit.
2528          */
2529         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2530                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2531                                  "isc_thread_join() %s",
2532                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2533                                                 ISC_MSG_FAILED, "failed"));
2534 #endif /* ISC_PLATFORM_USETHREADS */
2535
2536         /*
2537          * Clean up.
2538          */
2539 #ifdef ISC_PLATFORM_USETHREADS
2540         (void)close(manager->pipe_fds[0]);
2541         (void)close(manager->pipe_fds[1]);
2542         (void)isc_condition_destroy(&manager->shutdown_ok);
2543 #endif /* ISC_PLATFORM_USETHREADS */
2544
2545         for (i = 0; i < (int)FD_SETSIZE; i++)
2546                 if (manager->fdstate[i] == CLOSE_PENDING)
2547                         (void)close(i);
2548
2549         DESTROYLOCK(&manager->lock);
2550         manager->magic = 0;
2551         mctx= manager->mctx;
2552         isc_mem_put(mctx, manager, sizeof(*manager));
2553
2554         isc_mem_detach(&mctx);
2555
2556         *managerp = NULL;
2557 }
2558
2559 static isc_result_t
2560 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2561             unsigned int flags)
2562 {
2563         int io_state;
2564         isc_boolean_t have_lock = ISC_FALSE;
2565         isc_task_t *ntask = NULL;
2566         isc_result_t result = ISC_R_SUCCESS;
2567
2568         dev->ev_sender = task;
2569
2570         if (sock->type == isc_sockettype_udp) {
2571                 io_state = doio_recv(sock, dev);
2572         } else {
2573                 LOCK(&sock->lock);
2574                 have_lock = ISC_TRUE;
2575
2576                 if (ISC_LIST_EMPTY(sock->recv_list))
2577                         io_state = doio_recv(sock, dev);
2578                 else
2579                         io_state = DOIO_SOFT;
2580         }
2581
2582         switch (io_state) {
2583         case DOIO_SOFT:
2584                 /*
2585                  * We couldn't read all or part of the request right now, so
2586                  * queue it.
2587                  *
2588                  * Attach to socket and to task
2589                  */
2590                 isc_task_attach(task, &ntask);
2591                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2592
2593                 if (!have_lock) {
2594                         LOCK(&sock->lock);
2595                         have_lock = ISC_TRUE;
2596                 }
2597
2598                 /*
2599                  * Enqueue the request.  If the socket was previously not being
2600                  * watched, poke the watcher to start paying attention to it.
2601                  */
2602                 if (ISC_LIST_EMPTY(sock->recv_list))
2603                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2604                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2605
2606                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2607                            "socket_recv: event %p -> task %p",
2608                            dev, ntask);
2609
2610                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2611                         result = ISC_R_INPROGRESS;
2612                 break;
2613
2614         case DOIO_EOF:
2615                 dev->result = ISC_R_EOF;
2616                 /* fallthrough */
2617
2618         case DOIO_HARD:
2619         case DOIO_SUCCESS:
2620                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2621                         send_recvdone_event(sock, &dev);
2622                 break;
2623         }
2624
2625         if (have_lock)
2626                 UNLOCK(&sock->lock);
2627
2628         return (result);
2629 }
2630
2631 isc_result_t
2632 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2633                  unsigned int minimum, isc_task_t *task,
2634                  isc_taskaction_t action, const void *arg)
2635 {
2636         isc_socketevent_t *dev;
2637         isc_socketmgr_t *manager;
2638         unsigned int iocount;
2639         isc_buffer_t *buffer;
2640
2641         REQUIRE(VALID_SOCKET(sock));
2642         REQUIRE(buflist != NULL);
2643         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2644         REQUIRE(task != NULL);
2645         REQUIRE(action != NULL);
2646
2647         manager = sock->manager;
2648         REQUIRE(VALID_MANAGER(manager));
2649
2650         iocount = isc_bufferlist_availablecount(buflist);
2651         REQUIRE(iocount > 0);
2652
2653         INSIST(sock->bound);
2654
2655         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2656         if (dev == NULL) {
2657                 return (ISC_R_NOMEMORY);
2658         }
2659
2660         /*
2661          * UDP sockets are always partial read
2662          */
2663         if (sock->type == isc_sockettype_udp)
2664                 dev->minimum = 1;
2665         else {
2666                 if (minimum == 0)
2667                         dev->minimum = iocount;
2668                 else
2669                         dev->minimum = minimum;
2670         }
2671
2672         /*
2673          * Move each buffer from the passed in list to our internal one.
2674          */
2675         buffer = ISC_LIST_HEAD(*buflist);
2676         while (buffer != NULL) {
2677                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2678                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2679                 buffer = ISC_LIST_HEAD(*buflist);
2680         }
2681
2682         return (socket_recv(sock, dev, task, 0));
2683 }
2684
2685 isc_result_t
2686 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2687                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2688 {
2689         isc_socketevent_t *dev;
2690         isc_socketmgr_t *manager;
2691
2692         REQUIRE(VALID_SOCKET(sock));
2693         REQUIRE(action != NULL);
2694
2695         manager = sock->manager;
2696         REQUIRE(VALID_MANAGER(manager));
2697
2698         INSIST(sock->bound);
2699
2700         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2701         if (dev == NULL)
2702                 return (ISC_R_NOMEMORY);
2703
2704         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2705 }
2706
2707 isc_result_t
2708 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2709                  unsigned int minimum, isc_task_t *task,
2710                  isc_socketevent_t *event, unsigned int flags)
2711 {
2712         event->ev_sender = sock;
2713         event->result = ISC_R_UNEXPECTED;
2714         ISC_LIST_INIT(event->bufferlist);
2715         event->region = *region;
2716         event->n = 0;
2717         event->offset = 0;
2718         event->attributes = 0;
2719
2720         /*
2721          * UDP sockets are always partial read.
2722          */
2723         if (sock->type == isc_sockettype_udp)
2724                 event->minimum = 1;
2725         else {
2726                 if (minimum == 0)
2727                         event->minimum = region->length;
2728                 else
2729                         event->minimum = minimum;
2730         }
2731
2732         return (socket_recv(sock, event, task, flags));
2733 }
2734
2735 static isc_result_t
2736 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2737             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2738             unsigned int flags)
2739 {
2740         int io_state;
2741         isc_boolean_t have_lock = ISC_FALSE;
2742         isc_task_t *ntask = NULL;
2743         isc_result_t result = ISC_R_SUCCESS;
2744
2745         dev->ev_sender = task;
2746
2747         set_dev_address(address, sock, dev);
2748         if (pktinfo != NULL) {
2749                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2750                 dev->pktinfo = *pktinfo;
2751
2752                 if (!isc_sockaddr_issitelocal(&dev->address) &&
2753                     !isc_sockaddr_islinklocal(&dev->address)) {
2754                         socket_log(sock, NULL, TRACE, isc_msgcat,
2755                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2756                                    "pktinfo structure provided, ifindex %u "
2757                                    "(set to 0)", pktinfo->ipi6_ifindex);
2758
2759                         /*
2760                          * Set the pktinfo index to 0 here, to let the
2761                          * kernel decide what interface it should send on.
2762                          */
2763                         dev->pktinfo.ipi6_ifindex = 0;
2764                 }
2765         }
2766
2767         if (sock->type == isc_sockettype_udp)
2768                 io_state = doio_send(sock, dev);
2769         else {
2770                 LOCK(&sock->lock);
2771                 have_lock = ISC_TRUE;
2772
2773                 if (ISC_LIST_EMPTY(sock->send_list))
2774                         io_state = doio_send(sock, dev);
2775                 else
2776                         io_state = DOIO_SOFT;
2777         }
2778
2779         switch (io_state) {
2780         case DOIO_SOFT:
2781                 /*
2782                  * We couldn't send all or part of the request right now, so
2783                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2784                  */
2785                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2786                         isc_task_attach(task, &ntask);
2787                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2788
2789                         if (!have_lock) {
2790                                 LOCK(&sock->lock);
2791                                 have_lock = ISC_TRUE;
2792                         }
2793
2794                         /*
2795                          * Enqueue the request.  If the socket was previously
2796                          * not being watched, poke the watcher to start
2797                          * paying attention to it.
2798                          */
2799                         if (ISC_LIST_EMPTY(sock->send_list))
2800                                 select_poke(sock->manager, sock->fd,
2801                                             SELECT_POKE_WRITE);
2802                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2803
2804                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2805                                    "socket_send: event %p -> task %p",
2806                                    dev, ntask);
2807
2808                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2809                                 result = ISC_R_INPROGRESS;
2810                         break;
2811                 }
2812
2813         case DOIO_HARD:
2814         case DOIO_SUCCESS:
2815                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2816                         send_senddone_event(sock, &dev);
2817                 break;
2818         }
2819
2820         if (have_lock)
2821                 UNLOCK(&sock->lock);
2822
2823         return (result);
2824 }
2825
2826 isc_result_t
2827 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2828                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2829 {
2830         /*
2831          * REQUIRE() checking is performed in isc_socket_sendto().
2832          */
2833         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2834                                   NULL));
2835 }
2836
2837 isc_result_t
2838 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2839                   isc_task_t *task, isc_taskaction_t action, const void *arg,
2840                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2841 {
2842         isc_socketevent_t *dev;
2843         isc_socketmgr_t *manager;
2844
2845         REQUIRE(VALID_SOCKET(sock));
2846         REQUIRE(region != NULL);
2847         REQUIRE(task != NULL);
2848         REQUIRE(action != NULL);
2849
2850         manager = sock->manager;
2851         REQUIRE(VALID_MANAGER(manager));
2852
2853         INSIST(sock->bound);
2854
2855         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2856         if (dev == NULL) {
2857                 return (ISC_R_NOMEMORY);
2858         }
2859
2860         dev->region = *region;
2861
2862         return (socket_send(sock, dev, task, address, pktinfo, 0));
2863 }
2864
2865 isc_result_t
2866 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2867                  isc_task_t *task, isc_taskaction_t action, const void *arg)
2868 {
2869         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2870                                    NULL));
2871 }
2872
2873 isc_result_t
2874 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2875                    isc_task_t *task, isc_taskaction_t action, const void *arg,
2876                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2877 {
2878         isc_socketevent_t *dev;
2879         isc_socketmgr_t *manager;
2880         unsigned int iocount;
2881         isc_buffer_t *buffer;
2882
2883         REQUIRE(VALID_SOCKET(sock));
2884         REQUIRE(buflist != NULL);
2885         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2886         REQUIRE(task != NULL);
2887         REQUIRE(action != NULL);
2888
2889         manager = sock->manager;
2890         REQUIRE(VALID_MANAGER(manager));
2891
2892         iocount = isc_bufferlist_usedcount(buflist);
2893         REQUIRE(iocount > 0);
2894
2895         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2896         if (dev == NULL) {
2897                 return (ISC_R_NOMEMORY);
2898         }
2899
2900         /*
2901          * Move each buffer from the passed in list to our internal one.
2902          */
2903         buffer = ISC_LIST_HEAD(*buflist);
2904         while (buffer != NULL) {
2905                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2906                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2907                 buffer = ISC_LIST_HEAD(*buflist);
2908         }
2909
2910         return (socket_send(sock, dev, task, address, pktinfo, 0));
2911 }
2912
2913 isc_result_t
2914 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2915                    isc_task_t *task,
2916                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2917                    isc_socketevent_t *event, unsigned int flags)
2918 {
2919         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2920         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2921                 REQUIRE(sock->type == isc_sockettype_udp);
2922         event->ev_sender = sock;
2923         event->result = ISC_R_UNEXPECTED;
2924         ISC_LIST_INIT(event->bufferlist);
2925         event->region = *region;
2926         event->n = 0;
2927         event->offset = 0;
2928         event->attributes = 0;
2929
2930         return (socket_send(sock, event, task, address, pktinfo, flags));
2931 }
2932
2933 void
2934 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
2935 #ifdef ISC_PLATFORM_HAVESYSUNH
2936         int s;
2937         struct stat sb;
2938         char strbuf[ISC_STRERRORSIZE];
2939
2940         if (sockaddr->type.sa.sa_family != AF_UNIX)
2941                 return;
2942
2943 #ifndef S_ISSOCK
2944 #if defined(S_IFMT) && defined(S_IFSOCK)
2945 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
2946 #elif defined(_S_IFMT) && defined(S_IFSOCK)
2947 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
2948 #endif
2949 #endif
2950
2951 #ifndef S_ISFIFO
2952 #if defined(S_IFMT) && defined(S_IFIFO)
2953 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
2954 #elif defined(_S_IFMT) && defined(S_IFIFO)
2955 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
2956 #endif
2957 #endif
2958
2959 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
2960 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
2961 #endif
2962
2963 #ifndef S_ISFIFO
2964 #define S_ISFIFO(mode) 0
2965 #endif
2966
2967 #ifndef S_ISSOCK
2968 #define S_ISSOCK(mode) 0
2969 #endif
2970
2971         if (active) {
2972                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
2973                         isc__strerror(errno, strbuf, sizeof(strbuf));
2974                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2975                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2976                                       "isc_socket_cleanunix: stat(%s): %s",
2977                                       sockaddr->type.sunix.sun_path, strbuf);
2978                         return;
2979                 }
2980                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
2981                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2982                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2983                                       "isc_socket_cleanunix: %s: not a socket",
2984                                       sockaddr->type.sunix.sun_path);
2985                         return;
2986                 }
2987                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
2988                         isc__strerror(errno, strbuf, sizeof(strbuf));
2989                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2990                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2991                                       "isc_socket_cleanunix: unlink(%s): %s",
2992                                       sockaddr->type.sunix.sun_path, strbuf);
2993                 }
2994                 return;
2995         }
2996
2997         s = socket(AF_UNIX, SOCK_STREAM, 0);
2998         if (s < 0) {
2999                 isc__strerror(errno, strbuf, sizeof(strbuf));
3000                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3001                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3002                               "isc_socket_cleanunix: socket(%s): %s",
3003                               sockaddr->type.sunix.sun_path, strbuf);
3004                 return;
3005         }
3006
3007         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3008                 switch (errno) {
3009                 case ENOENT:    /* We exited cleanly last time */
3010                         break;
3011                 default:
3012                         isc__strerror(errno, strbuf, sizeof(strbuf));
3013                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3014                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3015                                       "isc_socket_cleanunix: stat(%s): %s",
3016                                       sockaddr->type.sunix.sun_path, strbuf);
3017                         break;
3018                 }
3019                 goto cleanup;
3020         }
3021
3022         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3023                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3024                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3025                               "isc_socket_cleanunix: %s: not a socket",
3026                               sockaddr->type.sunix.sun_path);
3027                 goto cleanup;
3028         }
3029
3030         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
3031                     sizeof(sockaddr->type.sunix)) < 0) {
3032                 switch (errno) {
3033                 case ECONNREFUSED:
3034                 case ECONNRESET:
3035                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3036                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3037                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3038                                               ISC_LOGMODULE_SOCKET,
3039                                               ISC_LOG_WARNING,
3040                                               "isc_socket_cleanunix: "
3041                                               "unlink(%s): %s",
3042                                               sockaddr->type.sunix.sun_path,
3043                                               strbuf);
3044                         }
3045                         break;
3046                 default:
3047                         isc__strerror(errno, strbuf, sizeof(strbuf));
3048                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3049                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3050                                       "isc_socket_cleanunix: connect(%s): %s",
3051                                       sockaddr->type.sunix.sun_path, strbuf);
3052                         break;
3053                 }
3054         }
3055  cleanup:
3056         close(s);
3057 #else
3058         UNUSED(sockaddr);
3059         UNUSED(active);
3060 #endif
3061 }
3062
3063 isc_result_t
3064 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
3065                     isc_uint32_t owner, isc_uint32_t group)
3066 {
3067 #ifdef ISC_PLATFORM_HAVESYSUNH
3068         isc_result_t result = ISC_R_SUCCESS;
3069         char strbuf[ISC_STRERRORSIZE];
3070         char path[sizeof(sockaddr->type.sunix.sun_path)];
3071 #ifdef NEED_SECURE_DIRECTORY
3072         char *slash;
3073 #endif
3074
3075         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
3076         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
3077         strcpy(path, sockaddr->type.sunix.sun_path);
3078
3079 #ifdef NEED_SECURE_DIRECTORY
3080         slash = strrchr(path, '/');
3081         if (slash != NULL) {
3082                 if (slash != path)
3083                         *slash = '\0';
3084                 else
3085                         strcpy(path, "/");
3086         } else
3087                 strcpy(path, ".");
3088 #endif
3089         
3090         if (chmod(path, perm) < 0) {
3091                 isc__strerror(errno, strbuf, sizeof(strbuf));
3092                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3093                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3094                               "isc_socket_permunix: chmod(%s, %d): %s",
3095                               path, perm, strbuf);
3096                 result = ISC_R_FAILURE;
3097         }
3098         if (chown(path, owner, group) < 0) {
3099                 isc__strerror(errno, strbuf, sizeof(strbuf));
3100                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3101                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3102                               "isc_socket_permunix: chown(%s, %d, %d): %s",
3103                               path, owner, group,
3104                               strbuf);
3105                 result = ISC_R_FAILURE;
3106         }
3107         return (result);
3108 #else
3109         UNUSED(sockaddr);
3110         UNUSED(perm);
3111         UNUSED(owner);
3112         UNUSED(group);
3113         return (ISC_R_NOTIMPLEMENTED);
3114 #endif
3115 }
3116
3117 isc_result_t
3118 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) {
3119         char strbuf[ISC_STRERRORSIZE];
3120         int on = 1;
3121
3122         LOCK(&sock->lock);
3123
3124         INSIST(!sock->bound);
3125
3126         if (sock->pf != sockaddr->type.sa.sa_family) {
3127                 UNLOCK(&sock->lock);
3128                 return (ISC_R_FAMILYMISMATCH);
3129         }
3130         /*
3131          * Only set SO_REUSEADDR when we want a specific port.
3132          */
3133 #ifdef AF_UNIX
3134         if (sock->pf == AF_UNIX)
3135                 goto bind_socket;
3136 #endif
3137         if (isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3138             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3139                        sizeof(on)) < 0) {
3140                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3141                                  "setsockopt(%d) %s", sock->fd,
3142                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3143                                                 ISC_MSG_FAILED, "failed"));
3144                 /* Press on... */
3145         }
3146 #ifdef AF_UNIX
3147  bind_socket:
3148 #endif
3149         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3150                 UNLOCK(&sock->lock);
3151                 switch (errno) {
3152                 case EACCES:
3153                         return (ISC_R_NOPERM);
3154                 case EADDRNOTAVAIL:
3155                         return (ISC_R_ADDRNOTAVAIL);
3156                 case EADDRINUSE:
3157                         return (ISC_R_ADDRINUSE);
3158                 case EINVAL:
3159                         return (ISC_R_BOUND);
3160                 default:
3161                         isc__strerror(errno, strbuf, sizeof(strbuf));
3162                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3163                                          strbuf);
3164                         return (ISC_R_UNEXPECTED);
3165                 }
3166         }
3167
3168         socket_log(sock, sockaddr, TRACE,
3169                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3170         sock->bound = 1;
3171
3172         UNLOCK(&sock->lock);
3173         return (ISC_R_SUCCESS);
3174 }
3175
3176 isc_result_t
3177 isc_socket_filter(isc_socket_t *sock, const char *filter) {
3178 #ifdef SO_ACCEPTFILTER
3179         char strbuf[ISC_STRERRORSIZE];
3180         struct accept_filter_arg afa;
3181 #else
3182         UNUSED(sock);
3183         UNUSED(filter);
3184 #endif
3185
3186         REQUIRE(VALID_SOCKET(sock));
3187
3188 #ifdef SO_ACCEPTFILTER
3189         bzero(&afa, sizeof(afa));
3190         strncpy(afa.af_name, filter, sizeof(afa.af_name));
3191         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
3192                          &afa, sizeof(afa)) == -1) {
3193                 isc__strerror(errno, strbuf, sizeof(strbuf));
3194                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
3195                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
3196                            strbuf);
3197                 return (ISC_R_FAILURE);
3198         }
3199         return (ISC_R_SUCCESS);
3200 #else
3201         return (ISC_R_NOTIMPLEMENTED);
3202 #endif
3203 }
3204
3205 /*
3206  * Set up to listen on a given socket.  We do this by creating an internal
3207  * event that will be dispatched when the socket has read activity.  The
3208  * watcher will send the internal event to the task when there is a new
3209  * connection.
3210  *
3211  * Unlike in read, we don't preallocate a done event here.  Every time there
3212  * is a new connection we'll have to allocate a new one anyway, so we might
3213  * as well keep things simple rather than having to track them.
3214  */
3215 isc_result_t
3216 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
3217         char strbuf[ISC_STRERRORSIZE];
3218
3219         REQUIRE(VALID_SOCKET(sock));
3220
3221         LOCK(&sock->lock);
3222
3223         REQUIRE(!sock->listener);
3224         REQUIRE(sock->bound);
3225         REQUIRE(sock->type == isc_sockettype_tcp ||
3226                 sock->type == isc_sockettype_unix);
3227
3228         if (backlog == 0)
3229                 backlog = SOMAXCONN;
3230
3231         if (listen(sock->fd, (int)backlog) < 0) {
3232                 UNLOCK(&sock->lock);
3233                 isc__strerror(errno, strbuf, sizeof(strbuf));
3234
3235                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3236
3237                 return (ISC_R_UNEXPECTED);
3238         }
3239
3240         sock->listener = 1;
3241
3242         UNLOCK(&sock->lock);
3243         return (ISC_R_SUCCESS);
3244 }
3245
3246 /*
3247  * This should try to do agressive accept() XXXMLG
3248  */
3249 isc_result_t
3250 isc_socket_accept(isc_socket_t *sock,
3251                   isc_task_t *task, isc_taskaction_t action, const void *arg)
3252 {
3253         isc_socket_newconnev_t *dev;
3254         isc_socketmgr_t *manager;
3255         isc_task_t *ntask = NULL;
3256         isc_socket_t *nsock;
3257         isc_result_t result;
3258         isc_boolean_t do_poke = ISC_FALSE;
3259
3260         REQUIRE(VALID_SOCKET(sock));
3261         manager = sock->manager;
3262         REQUIRE(VALID_MANAGER(manager));
3263
3264         LOCK(&sock->lock);
3265
3266         REQUIRE(sock->listener);
3267
3268         /*
3269          * Sender field is overloaded here with the task we will be sending
3270          * this event to.  Just before the actual event is delivered the
3271          * actual ev_sender will be touched up to be the socket.
3272          */
3273         dev = (isc_socket_newconnev_t *)
3274                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3275                                    action, arg, sizeof(*dev));
3276         if (dev == NULL) {
3277                 UNLOCK(&sock->lock);
3278                 return (ISC_R_NOMEMORY);
3279         }
3280         ISC_LINK_INIT(dev, ev_link);
3281
3282         result = allocate_socket(manager, sock->type, &nsock);
3283         if (result != ISC_R_SUCCESS) {
3284                 isc_event_free(ISC_EVENT_PTR(&dev));
3285                 UNLOCK(&sock->lock);
3286                 return (result);
3287         }
3288
3289         /*
3290          * Attach to socket and to task.
3291          */
3292         isc_task_attach(task, &ntask);
3293         nsock->references++;
3294
3295         dev->ev_sender = ntask;
3296         dev->newsocket = nsock;
3297
3298         /*
3299          * Poke watcher here.  We still have the socket locked, so there
3300          * is no race condition.  We will keep the lock for such a short
3301          * bit of time waking it up now or later won't matter all that much.
3302          */
3303         if (ISC_LIST_EMPTY(sock->accept_list))
3304                 do_poke = ISC_TRUE;
3305
3306         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3307
3308         if (do_poke)
3309                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3310
3311         UNLOCK(&sock->lock);
3312         return (ISC_R_SUCCESS);
3313 }
3314
3315 isc_result_t
3316 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3317                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3318 {
3319         isc_socket_connev_t *dev;
3320         isc_task_t *ntask = NULL;
3321         isc_socketmgr_t *manager;
3322         int cc;
3323         char strbuf[ISC_STRERRORSIZE];
3324
3325         REQUIRE(VALID_SOCKET(sock));
3326         REQUIRE(addr != NULL);
3327         REQUIRE(task != NULL);
3328         REQUIRE(action != NULL);
3329
3330         manager = sock->manager;
3331         REQUIRE(VALID_MANAGER(manager));
3332         REQUIRE(addr != NULL);
3333
3334         if (isc_sockaddr_ismulticast(addr))
3335                 return (ISC_R_MULTICAST);
3336
3337         LOCK(&sock->lock);
3338
3339         REQUIRE(!sock->connecting);
3340
3341         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3342                                                         ISC_SOCKEVENT_CONNECT,
3343                                                         action, arg,
3344                                                         sizeof(*dev));
3345         if (dev == NULL) {
3346                 UNLOCK(&sock->lock);
3347                 return (ISC_R_NOMEMORY);
3348         }
3349         ISC_LINK_INIT(dev, ev_link);
3350
3351         /*
3352          * Try to do the connect right away, as there can be only one
3353          * outstanding, and it might happen to complete.
3354          */
3355         sock->address = *addr;
3356         cc = connect(sock->fd, &addr->type.sa, addr->length);
3357         if (cc < 0) {
3358                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3359                         goto queue;
3360
3361                 switch (errno) {
3362 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3363                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3364                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3365                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3366                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3367                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3368 #ifdef EHOSTDOWN
3369                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3370 #endif
3371                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3372                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3373                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3374                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3375                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3376 #undef ERROR_MATCH
3377                 }
3378
3379                 sock->connected = 0;
3380
3381                 isc__strerror(errno, strbuf, sizeof(strbuf));
3382                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3383
3384                 UNLOCK(&sock->lock);
3385                 isc_event_free(ISC_EVENT_PTR(&dev));
3386                 return (ISC_R_UNEXPECTED);
3387
3388         err_exit:
3389                 sock->connected = 0;
3390                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3391
3392                 UNLOCK(&sock->lock);
3393                 return (ISC_R_SUCCESS);
3394         }
3395
3396         /*
3397          * If connect completed, fire off the done event.
3398          */
3399         if (cc == 0) {
3400                 sock->connected = 1;
3401                 sock->bound = 1;
3402                 dev->result = ISC_R_SUCCESS;
3403                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3404
3405                 UNLOCK(&sock->lock);
3406                 return (ISC_R_SUCCESS);
3407         }
3408
3409  queue:
3410
3411         /*
3412          * Attach to task.
3413          */
3414         isc_task_attach(task, &ntask);
3415
3416         sock->connecting = 1;
3417
3418         dev->ev_sender = ntask;
3419
3420         /*
3421          * Poke watcher here.  We still have the socket locked, so there
3422          * is no race condition.  We will keep the lock for such a short
3423          * bit of time waking it up now or later won't matter all that much.
3424          */
3425         if (sock->connect_ev == NULL)
3426                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3427
3428         sock->connect_ev = dev;
3429
3430         UNLOCK(&sock->lock);
3431         return (ISC_R_SUCCESS);
3432 }
3433
3434 /*
3435  * Called when a socket with a pending connect() finishes.
3436  */
3437 static void
3438 internal_connect(isc_task_t *me, isc_event_t *ev) {
3439         isc_socket_t *sock;
3440         isc_socket_connev_t *dev;
3441         isc_task_t *task;
3442         int cc;
3443         ISC_SOCKADDR_LEN_T optlen;
3444         char strbuf[ISC_STRERRORSIZE];
3445         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3446
3447         UNUSED(me);
3448         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3449
3450         sock = ev->ev_sender;
3451         INSIST(VALID_SOCKET(sock));
3452
3453         LOCK(&sock->lock);
3454
3455         /*
3456          * When the internal event was sent the reference count was bumped
3457          * to keep the socket around for us.  Decrement the count here.
3458          */
3459         INSIST(sock->references > 0);
3460         sock->references--;
3461         if (sock->references == 0) {
3462                 UNLOCK(&sock->lock);
3463                 destroy(&sock);
3464                 return;
3465         }
3466
3467         /*
3468          * Has this event been canceled?
3469          */
3470         dev = sock->connect_ev;
3471         if (dev == NULL) {
3472                 INSIST(!sock->connecting);
3473                 UNLOCK(&sock->lock);
3474                 return;
3475         }
3476
3477         INSIST(sock->connecting);
3478         sock->connecting = 0;
3479
3480         /*
3481          * Get any possible error status here.
3482          */
3483         optlen = sizeof(cc);
3484         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3485                        (void *)&cc, (void *)&optlen) < 0)
3486                 cc = errno;
3487         else
3488                 errno = cc;
3489
3490         if (errno != 0) {
3491                 /*
3492                  * If the error is EAGAIN, just re-select on this
3493                  * fd and pretend nothing strange happened.
3494                  */
3495                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3496                         sock->connecting = 1;
3497                         select_poke(sock->manager, sock->fd,
3498                                     SELECT_POKE_CONNECT);
3499                         UNLOCK(&sock->lock);
3500
3501                         return;
3502                 }
3503
3504                 /*
3505                  * Translate other errors into ISC_R_* flavors.
3506                  */
3507                 switch (errno) {
3508 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3509                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3510                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3511                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3512                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3513                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3514 #ifdef EHOSTDOWN
3515                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3516 #endif
3517                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3518                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3519                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3520                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3521                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3522                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3523 #undef ERROR_MATCH
3524                 default:
3525                         dev->result = ISC_R_UNEXPECTED;
3526                         isc_sockaddr_format(&sock->address, peerbuf,
3527                                             sizeof(peerbuf));
3528                         isc__strerror(errno, strbuf, sizeof(strbuf));
3529                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3530                                          "internal_connect: connect(%s) %s",
3531                                          peerbuf, strbuf);
3532                 }
3533         } else {
3534                 dev->result = ISC_R_SUCCESS;
3535                 sock->connected = 1;
3536                 sock->bound = 1;
3537         }
3538
3539         sock->connect_ev = NULL;
3540
3541         UNLOCK(&sock->lock);
3542
3543         task = dev->ev_sender;
3544         dev->ev_sender = sock;
3545         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3546 }
3547
3548 isc_result_t
3549 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3550         isc_result_t result;
3551
3552         REQUIRE(VALID_SOCKET(sock));
3553         REQUIRE(addressp != NULL);
3554
3555         LOCK(&sock->lock);
3556
3557         if (sock->connected) {
3558                 *addressp = sock->address;
3559                 result = ISC_R_SUCCESS;
3560         } else {
3561                 result = ISC_R_NOTCONNECTED;
3562         }
3563
3564         UNLOCK(&sock->lock);
3565
3566         return (result);
3567 }
3568
3569 isc_result_t
3570 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3571         ISC_SOCKADDR_LEN_T len;
3572         isc_result_t result;
3573         char strbuf[ISC_STRERRORSIZE];
3574
3575         REQUIRE(VALID_SOCKET(sock));
3576         REQUIRE(addressp != NULL);
3577
3578         LOCK(&sock->lock);
3579
3580         if (!sock->bound) {
3581                 result = ISC_R_NOTBOUND;
3582                 goto out;
3583         }
3584
3585         result = ISC_R_SUCCESS;
3586
3587         len = sizeof(addressp->type);
3588         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3589                 isc__strerror(errno, strbuf, sizeof(strbuf));
3590                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3591                                  strbuf);
3592                 result = ISC_R_UNEXPECTED;
3593                 goto out;
3594         }
3595         addressp->length = (unsigned int)len;
3596
3597  out:
3598         UNLOCK(&sock->lock);
3599
3600         return (result);
3601 }
3602
3603 /*
3604  * Run through the list of events on this socket, and cancel the ones
3605  * queued for task "task" of type "how".  "how" is a bitmask.
3606  */
3607 void
3608 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3609
3610         REQUIRE(VALID_SOCKET(sock));
3611
3612         /*
3613          * Quick exit if there is nothing to do.  Don't even bother locking
3614          * in this case.
3615          */
3616         if (how == 0)
3617                 return;
3618
3619         LOCK(&sock->lock);
3620
3621         /*
3622          * All of these do the same thing, more or less.
3623          * Each will:
3624          *      o If the internal event is marked as "posted" try to
3625          *        remove it from the task's queue.  If this fails, mark it
3626          *        as canceled instead, and let the task clean it up later.
3627          *      o For each I/O request for that task of that type, post
3628          *        its done event with status of "ISC_R_CANCELED".
3629          *      o Reset any state needed.
3630          */
3631         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3632             && !ISC_LIST_EMPTY(sock->recv_list)) {
3633                 isc_socketevent_t      *dev;
3634                 isc_socketevent_t      *next;
3635                 isc_task_t             *current_task;
3636
3637                 dev = ISC_LIST_HEAD(sock->recv_list);
3638
3639                 while (dev != NULL) {
3640                         current_task = dev->ev_sender;
3641                         next = ISC_LIST_NEXT(dev, ev_link);
3642
3643                         if ((task == NULL) || (task == current_task)) {
3644                                 dev->result = ISC_R_CANCELED;
3645                                 send_recvdone_event(sock, &dev);
3646                         }
3647                         dev = next;
3648                 }
3649         }
3650
3651         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3652             && !ISC_LIST_EMPTY(sock->send_list)) {
3653                 isc_socketevent_t      *dev;
3654                 isc_socketevent_t      *next;
3655                 isc_task_t             *current_task;
3656
3657                 dev = ISC_LIST_HEAD(sock->send_list);
3658
3659                 while (dev != NULL) {
3660                         current_task = dev->ev_sender;
3661                         next = ISC_LIST_NEXT(dev, ev_link);
3662
3663                         if ((task == NULL) || (task == current_task)) {
3664                                 dev->result = ISC_R_CANCELED;
3665                                 send_senddone_event(sock, &dev);
3666                         }
3667                         dev = next;
3668                 }
3669         }
3670
3671         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3672             && !ISC_LIST_EMPTY(sock->accept_list)) {
3673                 isc_socket_newconnev_t *dev;
3674                 isc_socket_newconnev_t *next;
3675                 isc_task_t             *current_task;
3676
3677                 dev = ISC_LIST_HEAD(sock->accept_list);
3678                 while (dev != NULL) {
3679                         current_task = dev->ev_sender;
3680                         next = ISC_LIST_NEXT(dev, ev_link);
3681
3682                         if ((task == NULL) || (task == current_task)) {
3683
3684                                 ISC_LIST_UNLINK(sock->accept_list, dev,
3685                                                 ev_link);
3686
3687                                 dev->newsocket->references--;
3688                                 free_socket(&dev->newsocket);
3689
3690                                 dev->result = ISC_R_CANCELED;
3691                                 dev->ev_sender = sock;
3692                                 isc_task_sendanddetach(&current_task,
3693                                                        ISC_EVENT_PTR(&dev));
3694                         }
3695
3696                         dev = next;
3697                 }
3698         }
3699
3700         /*
3701          * Connecting is not a list.
3702          */
3703         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3704             && sock->connect_ev != NULL) {
3705                 isc_socket_connev_t    *dev;
3706                 isc_task_t             *current_task;
3707
3708                 INSIST(sock->connecting);
3709                 sock->connecting = 0;
3710
3711                 dev = sock->connect_ev;
3712                 current_task = dev->ev_sender;
3713
3714                 if ((task == NULL) || (task == current_task)) {
3715                         sock->connect_ev = NULL;
3716
3717                         dev->result = ISC_R_CANCELED;
3718                         dev->ev_sender = sock;
3719                         isc_task_sendanddetach(&current_task,
3720                                                ISC_EVENT_PTR(&dev));
3721                 }
3722         }
3723
3724         UNLOCK(&sock->lock);
3725 }
3726
3727 isc_sockettype_t
3728 isc_socket_gettype(isc_socket_t *sock) {
3729         REQUIRE(VALID_SOCKET(sock));
3730
3731         return (sock->type);
3732 }
3733
3734 isc_boolean_t
3735 isc_socket_isbound(isc_socket_t *sock) {
3736         isc_boolean_t val;
3737
3738         LOCK(&sock->lock);
3739         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3740         UNLOCK(&sock->lock);
3741
3742         return (val);
3743 }
3744
3745 void
3746 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3747 #if defined(IPV6_V6ONLY)
3748         int onoff = yes ? 1 : 0;
3749 #else
3750         UNUSED(yes);
3751         UNUSED(sock);
3752 #endif
3753
3754         REQUIRE(VALID_SOCKET(sock));
3755
3756 #ifdef IPV6_V6ONLY
3757         if (sock->pf == AF_INET6) {
3758                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3759                                  (void *)&onoff, sizeof(onoff));
3760         }
3761 #endif
3762 }
3763
3764 #ifndef ISC_PLATFORM_USETHREADS
3765 void
3766 isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
3767         if (socketmgr == NULL)
3768                 *maxfd = 0;
3769         else {
3770                 *readset = socketmgr->read_fds;
3771                 *writeset = socketmgr->write_fds;
3772                 *maxfd = socketmgr->maxfd + 1;
3773         }
3774 }
3775
3776 isc_result_t
3777 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3778         isc_socketmgr_t *manager = socketmgr;
3779
3780         if (manager == NULL)
3781                 return (ISC_R_NOTFOUND);
3782
3783         process_fds(manager, maxfd, readset, writeset);
3784         return (ISC_R_SUCCESS);
3785 }
3786 #endif /* ISC_PLATFORM_USETHREADS */