2 * Copyright (C) 2004-2008 Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1998-2003 Internet Software Consortium.
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
18 /* $Id: socket.c,v 1.237.18.29.10.6 2008/07/29 04:47:31 each Exp $ */
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
28 #ifdef ISC_PLATFORM_HAVESYSUNH
41 #include <isc/buffer.h>
42 #include <isc/bufferlist.h>
43 #include <isc/condition.h>
44 #include <isc/formatcheck.h>
49 #include <isc/mutex.h>
52 #include <isc/platform.h>
53 #include <isc/print.h>
54 #include <isc/region.h>
55 #include <isc/socket.h>
56 #include <isc/strerror.h>
58 #include <isc/thread.h>
61 #include "errno2result.h"
63 #ifndef ISC_PLATFORM_USETHREADS
65 #endif /* ISC_PLATFORM_USETHREADS */
68 * Max number of open sockets. In the vast majority of cases the default size
69 * of FD_SETSIZE should be fine, and this constant should be increased only
70 * when absolutely necessary and possible, i.e., the server is exhausting all
71 * available file descriptors (up to FD_SETSIZE) and the select() function
72 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
73 * always be true, but we keep using some of them to ensure as much
74 * portability as possible). Note also that overall server performance
75 * may be rather worsened with a larger value of this constant due to
76 * inherent scalability problems of select().
78 * As a special note, this value shouldn't have to be touched if
79 * this is a build for an authoritative only DNS server.
82 #ifndef ISC_SOCKET_FDSETSIZE
83 #define ISC_SOCKET_FDSETSIZE FD_SETSIZE
87 * Mac OS X needs a special definition to support larger values in select()
89 #if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
91 #define _DARWIN_UNLIMITED_SELECT
92 #endif /* __APPLE__ */
96 * Some systems define the socket length argument as an int, some as size_t,
97 * some as socklen_t. This is here so it can be easily changed if needed.
99 #ifndef ISC_SOCKADDR_LEN_T
100 #define ISC_SOCKADDR_LEN_T unsigned int
104 #if defined(SO_BSDCOMPAT) && defined(__linux__)
105 #include <sys/utsname.h>
109 * Define what the possible "soft" errors can be. These are non-fatal returns
110 * of various network related functions, like recv() and so on.
112 * For some reason, BSDI (and perhaps others) will sometimes return <0
113 * from recv() but will have errno==0. This is broken, but we have to
114 * work around it here.
116 #define SOFT_ERROR(e) ((e) == EAGAIN || \
117 (e) == EWOULDBLOCK || \
121 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
124 * DLVL(90) -- Function entry/exit and other tracing.
125 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
126 * DLVL(60) -- Socket data send/receive
127 * DLVL(50) -- Event tracing, including receiving/sending completion events.
128 * DLVL(20) -- Socket creation/destruction.
130 #define TRACE_LEVEL 90
131 #define CORRECTNESS_LEVEL 70
132 #define IOEVENT_LEVEL 60
133 #define EVENT_LEVEL 50
134 #define CREATION_LEVEL 20
136 #define TRACE DLVL(TRACE_LEVEL)
137 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
138 #define IOEVENT DLVL(IOEVENT_LEVEL)
139 #define EVENT DLVL(EVENT_LEVEL)
140 #define CREATION DLVL(CREATION_LEVEL)
142 typedef isc_event_t intev_t;
144 #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
145 #define VALID_SOCKET(t) ISC_MAGIC_VALID(t, SOCKET_MAGIC)
148 * IPv6 control information. If the socket is an IPv6 socket we want
149 * to collect the destination address and interface so the client can
150 * set them on outgoing packets.
152 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
159 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
160 * a setsockopt() like interface to request timestamps, and if the OS
161 * doesn't do it for us, call gettimeofday() on every UDP receive?
170 * The size to raise the receive buffer to (from BIND 8).
172 #define RCVBUFSIZE (32*1024)
175 * The number of times a send operation is repeated if the result is EINTR.
182 isc_socketmgr_t *manager;
184 isc_sockettype_t type;
186 /* Locked by socket lock. */
187 ISC_LINK(isc_socket_t) link;
188 unsigned int references;
192 ISC_LIST(isc_socketevent_t) send_list;
193 ISC_LIST(isc_socketevent_t) recv_list;
194 ISC_LIST(isc_socket_newconnev_t) accept_list;
195 isc_socket_connev_t *connect_ev;
198 * Internal events. Posted when a descriptor is readable or
199 * writable. These are statically allocated and never freed.
200 * They will be set to non-purgable before use.
205 isc_sockaddr_t address; /* remote address */
207 unsigned int pending_recv : 1,
210 listener : 1, /* listener socket */
212 connecting : 1, /* connect pending */
213 bound : 1; /* bound to local addr */
215 #ifdef ISC_NET_RECVOVERFLOW
216 unsigned char overflow; /* used for MSG_TRUNC fake */
220 ISC_SOCKADDR_LEN_T recvcmsgbuflen;
222 ISC_SOCKADDR_LEN_T sendcmsgbuflen;
225 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
226 #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
228 struct isc_socketmgr {
235 /* Locked by manager lock. */
236 ISC_LIST(isc_socket_t) socklist;
238 fd_set *read_fds_copy;
240 fd_set *write_fds_copy;
244 int reserved; /* unlocked */
245 #ifdef ISC_PLATFORM_USETHREADS
246 isc_thread_t watcher;
247 isc_condition_t shutdown_ok;
249 #else /* ISC_PLATFORM_USETHREADS */
251 #endif /* ISC_PLATFORM_USETHREADS */
254 #ifndef ISC_PLATFORM_USETHREADS
255 static isc_socketmgr_t *socketmgr = NULL;
256 #endif /* ISC_PLATFORM_USETHREADS */
258 #define CLOSED 0 /* this one must be zero */
260 #define CLOSE_PENDING 2
263 * send() and recv() iovec counts
265 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
266 #ifdef ISC_NET_RECVOVERFLOW
267 # define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
269 # define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
272 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
273 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
274 static void free_socket(isc_socket_t **);
275 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
277 static void destroy(isc_socket_t **);
278 static void internal_accept(isc_task_t *, isc_event_t *);
279 static void internal_connect(isc_task_t *, isc_event_t *);
280 static void internal_recv(isc_task_t *, isc_event_t *);
281 static void internal_send(isc_task_t *, isc_event_t *);
282 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
283 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
284 struct msghdr *, struct iovec *, size_t *);
285 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
286 struct msghdr *, struct iovec *, size_t *);
287 static void cleanup_fdsets(isc_socketmgr_t *, isc_mem_t *);
288 static isc_result_t create_fdsets(isc_socketmgr_t *, isc_mem_t *);
290 #define SELECT_POKE_SHUTDOWN (-1)
291 #define SELECT_POKE_NOTHING (-2)
292 #define SELECT_POKE_READ (-3)
293 #define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */
294 #define SELECT_POKE_WRITE (-4)
295 #define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */
296 #define SELECT_POKE_CLOSE (-5)
298 #define SOCK_DEAD(s) ((s)->references == 0)
/*
 * manager_log(): format the varargs message with vsnprintf() and emit it
 * through isc_log_write(), tagged with the manager pointer.  The first
 * three lines are the prototype carrying ISC_FORMAT_PRINTF(5, 6) so the
 * compiler can type-check the format arguments.
 * NOTE(review): this excerpt omits lines (static keyword, locals,
 * va_start/va_end, braces); treat the visible code as a partial listing.
 */
301 manager_log(isc_socketmgr_t *sockmgr,
302 isc_logcategory_t *category, isc_logmodule_t *module, int level,
303 const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
305 manager_log(isc_socketmgr_t *sockmgr,
306 isc_logcategory_t *category, isc_logmodule_t *module, int level,
307 const char *fmt, ...)
/* Cheap early-out: skip all formatting work if nothing would be logged. */
312 if (! isc_log_wouldlog(isc_lctx, level))
316 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
319 isc_log_write(isc_lctx, category, module, level,
320 "sockmgr %p: %s", sockmgr, msgbuf);
/*
 * socket_log(): like manager_log() but tagged with a socket pointer and,
 * when 'address' is non-NULL, the formatted peer address.  Uses the
 * message-catalog variant isc_log_iwrite().  Prototype carries
 * ISC_FORMAT_PRINTF(9, 10) for format checking.
 * NOTE(review): lines are missing from this excerpt (msgbuf declaration,
 * va_list handling, braces) -- partial listing only.
 */
324 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
325 isc_logcategory_t *category, isc_logmodule_t *module, int level,
326 isc_msgcat_t *msgcat, int msgset, int message,
327 const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
329 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
330 isc_logcategory_t *category, isc_logmodule_t *module, int level,
331 isc_msgcat_t *msgcat, int msgset, int message,
332 const char *fmt, ...)
335 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
/* Skip formatting entirely when the level would not be logged. */
338 if (! isc_log_wouldlog(isc_lctx, level))
342 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
/* Two output shapes: with or without the peer address. */
345 if (address == NULL) {
346 isc_log_iwrite(isc_lctx, category, module, level,
347 msgcat, msgset, message,
348 "socket %p: %s", sock, msgbuf);
350 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
351 isc_log_iwrite(isc_lctx, category, module, level,
352 msgcat, msgset, message,
353 "socket %p %s: %s", sock, peerbuf, msgbuf);
/*
 * wakeup_socket(): act on a poke message for descriptor 'fd'.  A fd in
 * CLOSE_PENDING state is finished off (marked CLOSED, removed from both
 * fd_sets); otherwise, if the fd is MANAGED, it is added to the read or
 * write fd_set as requested by 'msg'.
 * NOTE(review): this excerpt omits lines (the 'sock' declaration, early
 * returns, closing brace) -- partial listing only.
 */
358 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
362 * This is a wakeup on a socket. If the socket is not in the
363 * process of being closed, start watching it for either reads
367 INSIST(fd >= 0 && fd < manager->fdsize);
369 if (manager->fdstate[fd] == CLOSE_PENDING) {
370 manager->fdstate[fd] = CLOSED;
371 FD_CLR(fd, manager->read_fds);
372 FD_CLR(fd, manager->write_fds);
/* Only MANAGED descriptors get (re)armed for select(). */
376 if (manager->fdstate[fd] != MANAGED)
379 sock = manager->fds[fd];
384 if (msg == SELECT_POKE_READ)
385 FD_SET(sock->fd, manager->read_fds);
386 if (msg == SELECT_POKE_WRITE)
387 FD_SET(sock->fd, manager->write_fds);
390 #ifdef ISC_PLATFORM_USETHREADS
392 * Poke the select loop when there is something for us to do.
393 * The write is required (by POSIX) to complete. That is, we
394 * will not get partial writes.
/*
 * Threaded variant: writes {fd, msg} down the internal pipe so the
 * watcher thread's select() wakes up and processes it.
 * NOTE(review): excerpt is missing lines ('cc'/'buf' declarations, the
 * do/sleep retry body, braces) -- partial listing only.
 */
397 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
400 char strbuf[ISC_STRERRORSIZE];
406 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
409 * Treat ENOSR as EAGAIN but loop slowly as it is
410 * unlikely to clear fast.
412 if (cc < 0 && errno == ENOSR) {
/* Retry while the error is recoverable (EAGAIN/EWOULDBLOCK/...). */
417 } while (cc < 0 && SOFT_ERROR(errno));
420 isc__strerror(errno, strbuf, sizeof(strbuf));
421 FATAL_ERROR(__FILE__, __LINE__,
422 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
425 "during watcher poke: %s"),
/* POSIX guarantees pipe writes <= PIPE_BUF are not partial. */
429 INSIST(cc == sizeof(buf));
433 * Read a message on the internal fd.
/*
 * select_readmsg(): watcher-side counterpart of select_poke() -- reads a
 * {fd, msg} pair from the pipe.  On a soft error it reports
 * SELECT_POKE_NOTHING so the caller simply retries later.
 * NOTE(review): excerpt omits lines (declarations of 'cc'/'buf', the
 * error-branch structure, assignments of *fd/*msg on success, braces).
 * NOTE(review): the fatal-error string says "poke" although this is the
 * read path -- looks like a copy/paste of select_poke()'s message;
 * runtime string left untouched here.
 */
436 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
439 char strbuf[ISC_STRERRORSIZE];
441 cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
443 *msg = SELECT_POKE_NOTHING;
444 *fd = -1; /* Silence compiler. */
445 if (SOFT_ERROR(errno))
448 isc__strerror(errno, strbuf, sizeof(strbuf));
449 FATAL_ERROR(__FILE__, __LINE__,
450 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
453 "during watcher poke: %s"),
458 INSIST(cc == sizeof(buf));
463 #else /* ISC_PLATFORM_USETHREADS */
465 * Update the state of the socketmgr when something changes.
/*
 * Non-threaded variant: no watcher thread and no pipe, so apply the
 * state change directly via wakeup_socket() (shutdown pokes are handled
 * in the branch whose body is outside this excerpt).
 */
468 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
469 if (msg == SELECT_POKE_SHUTDOWN)
472 wakeup_socket(manager, fd, msg);
475 #endif /* ISC_PLATFORM_USETHREADS */
478 * Make a fd non-blocking.
/*
 * make_nonblock(): set O_NONBLOCK-style behavior on 'fd', using either
 * ioctl(FIONBIO) or fcntl(F_SETFL) depending on the platform.  Returns
 * ISC_R_SUCCESS or ISC_R_UNEXPECTED (after logging the errno text).
 * NOTE(review): excerpt omits lines (return type, 'ret'/'flags'/'on'
 * declarations, the #else/#endif pairing, braces) -- partial listing.
 */
481 make_nonblock(int fd) {
484 char strbuf[ISC_STRERRORSIZE];
485 #ifdef USE_FIONBIO_IOCTL
488 ret = ioctl(fd, FIONBIO, (char *)&on);
/* fcntl path: read-modify-write the file status flags. */
490 flags = fcntl(fd, F_GETFL, 0);
491 flags |= PORT_NONBLOCK;
492 ret = fcntl(fd, F_SETFL, flags);
496 isc__strerror(errno, strbuf, sizeof(strbuf));
497 UNEXPECTED_ERROR(__FILE__, __LINE__,
498 #ifdef USE_FIONBIO_IOCTL
499 "ioctl(%d, FIONBIO, &on): %s", fd,
501 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
505 return (ISC_R_UNEXPECTED);
508 return (ISC_R_SUCCESS);
513 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
514 * In order to ensure as much portability as possible, we provide wrapper
515 * functions of these macros.
516 * Note that cmsg_space() could run slow on OSes that do not have
/*
 * cmsg_len(): portability wrapper -- use CMSG_LEN when available,
 * otherwise compute header length via pointer arithmetic on CMSG_DATA
 * applied to a NULL cmsghdr, and add the payload length.
 * NOTE(review): the #ifdef/#else around the two implementations is not
 * visible in this excerpt.
 */
519 static inline ISC_SOCKADDR_LEN_T
520 cmsg_len(ISC_SOCKADDR_LEN_T len) {
522 return (CMSG_LEN(len));
524 ISC_SOCKADDR_LEN_T hdrlen;
527 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
530 hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
531 return (hdrlen + len);
/*
 * cmsg_space(): portability wrapper -- use CMSG_SPACE when available;
 * otherwise simulate it by laying a cmsghdr of the requested length in a
 * dummy control buffer and asking CMSG_NXTHDR where the next header
 * would start.  The distance from the buffer start is the aligned space.
 * NOTE(review): the #ifdef/#else and the 'msg' declaration are outside
 * this excerpt.
 */
535 static inline ISC_SOCKADDR_LEN_T
536 cmsg_space(ISC_SOCKADDR_LEN_T len) {
538 return (CMSG_SPACE(len));
541 struct cmsghdr *cmsgp;
543 * XXX: The buffer length is an ad-hoc value, but should be enough
544 * in a practical sense.
546 char dummybuf[sizeof(struct cmsghdr) + 1024];
548 memset(&msg, 0, sizeof(msg));
549 msg.msg_control = dummybuf;
550 msg.msg_controllen = sizeof(dummybuf);
552 cmsgp = (struct cmsghdr *)dummybuf;
553 cmsgp->cmsg_len = cmsg_len(len);
555 cmsgp = CMSG_NXTHDR(&msg, cmsgp);
557 return ((char *)cmsgp - (char *)msg.msg_control);
562 #endif /* USE_CMSG */
565 * Process control messages received on a socket.
/*
 * process_cmsg(): walk the ancillary data attached to a received
 * message and record the interesting bits on the socket event 'dev':
 *   - MSG_TRUNC / MSG_CTRUNC flags  -> TRUNC / CTRUNC attributes
 *   - IPV6_PKTINFO                  -> dev->pktinfo (+ MULTICAST check)
 *   - SCM_TIMESTAMP                 -> dev->timestamp
 * All of this is compiled away unless ISC_NET_BSD44MSGHDR (and, for the
 * cmsg walk, USE_CMSG) are defined.
 * NOTE(review): this excerpt omits lines (UNUSED() statements, early
 * return for empty control data, several braces/#ifdef partners) --
 * partial listing only.
 */
568 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
570 struct cmsghdr *cmsgp;
571 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
572 struct in6_pktinfo *pktinfop;
575 struct timeval *timevalp;
580 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
581 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
582 * They are all here, outside of the CPP tests, because it is
583 * more consistent with the usual ISC coding style.
589 #ifdef ISC_NET_BSD44MSGHDR
592 if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
593 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
597 if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
598 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
/* No control data attached: nothing more to do. */
604 if (msg->msg_controllen == 0U || msg->msg_control == NULL)
610 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
614 cmsgp = CMSG_FIRSTHDR(msg);
615 while (cmsgp != NULL) {
616 socket_log(sock, NULL, TRACE,
617 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
618 "processing cmsg %p", cmsgp);
620 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
621 if (cmsgp->cmsg_level == IPPROTO_IPV6
622 && cmsgp->cmsg_type == IPV6_PKTINFO) {
624 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
625 memcpy(&dev->pktinfo, pktinfop,
626 sizeof(struct in6_pktinfo));
627 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
628 socket_log(sock, NULL, TRACE,
629 isc_msgcat, ISC_MSGSET_SOCKET,
631 "interface received on ifindex %u",
632 dev->pktinfo.ipi6_ifindex);
633 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
634 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
640 if (cmsgp->cmsg_level == SOL_SOCKET
641 && cmsgp->cmsg_type == SCM_TIMESTAMP) {
642 timevalp = (struct timeval *)CMSG_DATA(cmsgp);
643 dev->timestamp.seconds = timevalp->tv_sec;
644 dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
645 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
651 cmsgp = CMSG_NXTHDR(msg, cmsgp);
653 #endif /* USE_CMSG */
655 #endif /* ISC_NET_BSD44MSGHDR */
659 * Construct an iov array and attach it to the msghdr passed in. This is
660 * the SEND constructor, which will use the used region of the buffer
661 * (if using a buffer list) or will use the internal region (if a single
662 * buffer I/O is requested).
664 * Nothing can be NULL, and the done event must list at least one buffer
665 * on the buffer linked list for this function to be meaningful.
667 * If write_countp != NULL, *write_countp will hold the number of bytes
668 * this transaction can send.
/*
 * NOTE(review): lines are missing from this excerpt (the 'used'/
 * 'write_count'/'skip_count' declarations, msg_iov assignment, several
 * braces) -- treat the visible code as a partial listing.
 */
671 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
672 struct msghdr *msg, struct iovec *iov, size_t *write_countp)
674 unsigned int iovcount;
675 isc_buffer_t *buffer;
680 memset(msg, 0, sizeof(*msg));
/* UDP sends carry the destination address; TCP uses the connection. */
682 if (sock->type == isc_sockettype_udp) {
683 msg->msg_name = (void *)&dev->address.type.sa;
684 msg->msg_namelen = dev->address.length;
686 msg->msg_name = NULL;
687 msg->msg_namelen = 0;
690 buffer = ISC_LIST_HEAD(dev->bufferlist);
695 * Single buffer I/O? Skip what we've done so far in this region.
697 if (buffer == NULL) {
698 write_count = dev->region.length - dev->n;
699 iov[0].iov_base = (void *)(dev->region.base + dev->n);
700 iov[0].iov_len = write_count;
708 * Skip the data in the buffer list that we have already written.
/* skip_count starts at dev->n; walk past fully-sent buffers. */
711 while (buffer != NULL) {
712 REQUIRE(ISC_BUFFER_VALID(buffer));
713 if (skip_count < isc_buffer_usedlength(buffer))
715 skip_count -= isc_buffer_usedlength(buffer);
716 buffer = ISC_LIST_NEXT(buffer, link);
/* Fill iov entries from the remaining (partially sent) buffers. */
719 while (buffer != NULL) {
720 INSIST(iovcount < MAXSCATTERGATHER_SEND);
722 isc_buffer_usedregion(buffer, &used);
724 if (used.length > 0) {
725 iov[iovcount].iov_base = (void *)(used.base
727 iov[iovcount].iov_len = used.length - skip_count;
728 write_count += (used.length - skip_count);
732 buffer = ISC_LIST_NEXT(buffer, link);
735 INSIST(skip_count == 0U);
739 msg->msg_iovlen = iovcount;
741 #ifdef ISC_NET_BSD44MSGHDR
742 msg->msg_control = NULL;
743 msg->msg_controllen = 0;
745 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
/* Attach IPV6_PKTINFO ancillary data when the event requests it. */
746 if ((sock->type == isc_sockettype_udp)
747 && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
748 struct cmsghdr *cmsgp;
749 struct in6_pktinfo *pktinfop;
751 socket_log(sock, NULL, TRACE,
752 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
753 "sendto pktinfo data, ifindex %u",
754 dev->pktinfo.ipi6_ifindex);
756 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
757 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
758 msg->msg_control = (void *)sock->sendcmsgbuf;
760 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
761 cmsgp->cmsg_level = IPPROTO_IPV6;
762 cmsgp->cmsg_type = IPV6_PKTINFO;
763 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
764 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
765 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
767 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
768 #else /* ISC_NET_BSD44MSGHDR */
769 msg->msg_accrights = NULL;
770 msg->msg_accrightslen = 0;
771 #endif /* ISC_NET_BSD44MSGHDR */
773 if (write_countp != NULL)
774 *write_countp = write_count;
778 * Construct an iov array and attach it to the msghdr passed in. This is
779 * the RECV constructor, which will use the available region of the buffer
780 * (if using a buffer list) or will use the internal region (if a single
781 * buffer I/O is requested).
783 * Nothing can be NULL, and the done event must list at least one buffer
784 * on the buffer linked list for this function to be meaningful.
786 * If read_countp != NULL, *read_countp will hold the number of bytes
787 * this transaction can receive.
/*
 * NOTE(review): this excerpt omits lines ('read_count' declaration,
 * msg_iov assignment, iovcount increments, several braces/#endif
 * partners) -- treat the visible code as a partial listing.
 */
790 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
791 struct msghdr *msg, struct iovec *iov, size_t *read_countp)
793 unsigned int iovcount;
794 isc_buffer_t *buffer;
795 isc_region_t available;
798 memset(msg, 0, sizeof(struct msghdr));
/* UDP: prepare msg_name so recvmsg() fills in the source address. */
800 if (sock->type == isc_sockettype_udp) {
801 memset(&dev->address, 0, sizeof(dev->address));
802 #ifdef BROKEN_RECVMSG
/*
 * NOTE(review): in the AF_INET branch msg_namelen uses
 * sizeof(...sin6), not sizeof(...sin) -- looks suspect; confirm
 * against upstream before "fixing" (workaround code path).
 */
803 if (sock->pf == AF_INET) {
804 msg->msg_name = (void *)&dev->address.type.sin;
805 msg->msg_namelen = sizeof(dev->address.type.sin6);
806 } else if (sock->pf == AF_INET6) {
807 msg->msg_name = (void *)&dev->address.type.sin6;
808 msg->msg_namelen = sizeof(dev->address.type.sin6);
809 #ifdef ISC_PLATFORM_HAVESYSUNH
810 } else if (sock->pf == AF_UNIX) {
811 msg->msg_name = (void *)&dev->address.type.sunix;
812 msg->msg_namelen = sizeof(dev->address.type.sunix);
815 msg->msg_name = (void *)&dev->address.type.sa;
816 msg->msg_namelen = sizeof(dev->address.type);
819 msg->msg_name = (void *)&dev->address.type.sa;
820 msg->msg_namelen = sizeof(dev->address.type);
822 #ifdef ISC_NET_RECVOVERFLOW
823 /* If needed, steal one iovec for overflow detection. */
/* TCP: peer is fixed; copy the stored remote address into the event. */
827 msg->msg_name = NULL;
828 msg->msg_namelen = 0;
829 dev->address = sock->address;
832 buffer = ISC_LIST_HEAD(dev->bufferlist);
836 * Single buffer I/O? Skip what we've done so far in this region.
838 if (buffer == NULL) {
839 read_count = dev->region.length - dev->n;
840 iov[0].iov_base = (void *)(dev->region.base + dev->n);
841 iov[0].iov_len = read_count;
849 * Skip empty buffers.
851 while (buffer != NULL) {
852 REQUIRE(ISC_BUFFER_VALID(buffer));
853 if (isc_buffer_availablelength(buffer) != 0)
855 buffer = ISC_LIST_NEXT(buffer, link);
/* Fill iov entries from each buffer's available region. */
859 while (buffer != NULL) {
860 INSIST(iovcount < MAXSCATTERGATHER_RECV);
862 isc_buffer_availableregion(buffer, &available);
864 if (available.length > 0) {
865 iov[iovcount].iov_base = (void *)(available.base);
866 iov[iovcount].iov_len = available.length;
867 read_count += available.length;
870 buffer = ISC_LIST_NEXT(buffer, link);
876 * If needed, set up to receive that one extra byte. Note that
877 * we know there is at least one iov left, since we stole it
878 * at the top of this function.
880 #ifdef ISC_NET_RECVOVERFLOW
881 if (sock->type == isc_sockettype_udp) {
882 iov[iovcount].iov_base = (void *)(&sock->overflow);
883 iov[iovcount].iov_len = 1;
889 msg->msg_iovlen = iovcount;
891 #ifdef ISC_NET_BSD44MSGHDR
892 msg->msg_control = NULL;
893 msg->msg_controllen = 0;
895 #if defined(USE_CMSG)
/* UDP receives get the per-socket cmsg buffer for ancillary data. */
896 if (sock->type == isc_sockettype_udp) {
897 msg->msg_control = sock->recvcmsgbuf;
898 msg->msg_controllen = sock->recvcmsgbuflen;
900 #endif /* USE_CMSG */
901 #else /* ISC_NET_BSD44MSGHDR */
902 msg->msg_accrights = NULL;
903 msg->msg_accrightslen = 0;
904 #endif /* ISC_NET_BSD44MSGHDR */
906 if (read_countp != NULL)
907 *read_countp = read_count;
/*
 * set_dev_address(): record the peer address on the event.  For UDP,
 * prefer the caller-supplied 'address' (fall back to the socket's stored
 * address when NULL); for TCP the address must be NULL and the stored
 * connection address is used.
 * NOTE(review): 'static void', the if/else braces and closing brace are
 * outside this excerpt.
 */
911 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
912 isc_socketevent_t *dev)
914 if (sock->type == isc_sockettype_udp) {
916 dev->address = *address;
918 dev->address = sock->address;
919 } else if (sock->type == isc_sockettype_tcp) {
920 INSIST(address == NULL);
921 dev->address = sock->address;
/*
 * destroy_socketevent(): event-destructor hook installed by
 * allocate_socketevent().  Asserts the buffer list has been drained,
 * then chains to the original destructor saved in ev->destroy.
 */
926 destroy_socketevent(isc_event_t *event) {
927 isc_socketevent_t *ev = (isc_socketevent_t *)event;
929 INSIST(ISC_LIST_EMPTY(ev->bufferlist));
931 (ev->destroy)(event);
/*
 * allocate_socketevent(): allocate and initialize a socket event of the
 * given type, wiring in destroy_socketevent() as destructor while saving
 * the original one in ev->destroy.  Returns the event (NULL handling and
 * several field initializations are outside this excerpt).
 */
934 static isc_socketevent_t *
935 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
936 isc_taskaction_t action, const void *arg)
938 isc_socketevent_t *ev;
940 ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
948 ev->result = ISC_R_UNEXPECTED;
949 ISC_LINK_INIT(ev, ev_link);
950 ISC_LIST_INIT(ev->bufferlist);
951 ev->region.base = NULL;
/* Interpose our destructor; keep the original for chaining. */
955 ev->destroy = ev->ev_destroy;
956 ev->ev_destroy = destroy_socketevent;
961 #if defined(ISC_SOCKET_DEBUG)
/*
 * dump_msg(): debug-only helper -- print the msghdr, each iovec, and
 * (on BSD4.4-style msghdrs) the control buffer to stdout.
 */
963 dump_msg(struct msghdr *msg) {
966 printf("MSGHDR %p\n", msg);
967 printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
968 printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);
969 for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
970 printf("\t\t%d\tbase %p, len %d\n", i,
971 msg->msg_iov[i].iov_base,
972 msg->msg_iov[i].iov_len);
973 #ifdef ISC_NET_BSD44MSGHDR
974 printf("\tcontrol %p, controllen %d\n", msg->msg_control,
975 msg->msg_controllen);
/* Result codes shared by doio_recv()/doio_send(). */
980 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
981 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
982 #define DOIO_HARD 2 /* i/o error, event sent */
983 #define DOIO_EOF 3 /* EOF, no event sent */
/*
 * doio_recv(): perform one recvmsg() for the pending event 'dev'.
 * Builds the scatter iov, classifies errno into soft/hard results,
 * detects TCP EOF (cc == 0), drops UDP packets with source port zero,
 * handles the RECVOVERFLOW extra byte, runs process_cmsg(), then
 * advances the buffer list by the byte count actually read.
 * NOTE(review): this excerpt omits lines ('cc'/'recv_errno'/
 * 'actual_count' declarations, dump_msg calls, returns, macro #undefs,
 * braces) -- treat the visible code as a partial listing.
 */
986 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
988 struct iovec iov[MAXSCATTERGATHER_RECV];
991 struct msghdr msghdr;
992 isc_buffer_t *buffer;
994 char strbuf[ISC_STRERRORSIZE];
996 build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
998 #if defined(ISC_SOCKET_DEBUG)
1002 cc = recvmsg(sock->fd, &msghdr, 0);
1005 #if defined(ISC_SOCKET_DEBUG)
/* Recoverable errno: tell the caller to retry later. */
1010 if (SOFT_ERROR(recv_errno))
1013 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1014 isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1015 socket_log(sock, NULL, IOEVENT,
1016 isc_msgcat, ISC_MSGSET_SOCKET,
1018 "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1019 sock->fd, cc, recv_errno, strbuf);
/*
 * SOFT_OR_HARD: hard error only on a connected socket (the error
 * then refers to our peer); otherwise soft, since on an unconnected
 * UDP socket it may refer to some previous destination.
 */
1022 #define SOFT_OR_HARD(_system, _isc) \
1023 if (recv_errno == _system) { \
1024 if (sock->connected) { \
1025 dev->result = _isc; \
1026 return (DOIO_HARD); \
1028 return (DOIO_SOFT); \
1030 #define ALWAYS_HARD(_system, _isc) \
1031 if (recv_errno == _system) { \
1032 dev->result = _isc; \
1033 return (DOIO_HARD); \
1036 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1037 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1038 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1039 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1040 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1041 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1042 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1047 dev->result = isc__errno2result(recv_errno);
1052 * On TCP, zero length reads indicate EOF, while on
1053 * UDP, zero length reads are perfectly valid, although
1056 if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1059 if (sock->type == isc_sockettype_udp) {
1060 dev->address.length = msghdr.msg_namelen;
/* Port 0 is never a legitimate source; drop such packets. */
1061 if (isc_sockaddr_getport(&dev->address) == 0) {
1062 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1063 socket_log(sock, &dev->address, IOEVENT,
1064 isc_msgcat, ISC_MSGSET_SOCKET,
1066 "dropping source port zero packet");
1072 socket_log(sock, &dev->address, IOEVENT,
1073 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1074 "packet received correctly");
1077 * Overflow bit detection. If we received MORE bytes than we should,
1078 * this indicates an overflow situation. Set the flag in the
1079 * dev entry and adjust how much we read by one.
1081 #ifdef ISC_NET_RECVOVERFLOW
1082 if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1083 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1089 * If there are control messages attached, run through them and pull
1090 * out the interesting bits.
1092 if (sock->type == isc_sockettype_udp)
1093 process_cmsg(sock, &msghdr, dev);
1096 * update the buffers (if any) and the i/o count
1100 buffer = ISC_LIST_HEAD(dev->bufferlist);
1101 while (buffer != NULL && actual_count > 0U) {
1102 REQUIRE(ISC_BUFFER_VALID(buffer));
1103 if (isc_buffer_availablelength(buffer) <= actual_count) {
1104 actual_count -= isc_buffer_availablelength(buffer);
1105 isc_buffer_add(buffer,
1106 isc_buffer_availablelength(buffer));
1108 isc_buffer_add(buffer, actual_count);
1112 buffer = ISC_LIST_NEXT(buffer, link);
1113 if (buffer == NULL) {
1114 INSIST(actual_count == 0U);
1119 * If we read less than we expected, update counters,
1120 * and let the upper layer poke the descriptor.
1122 if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1126 * Full reads are posted, or partials if partials are ok.
1128 dev->result = ISC_R_SUCCESS;
1129 return (DOIO_SUCCESS);
1134 * DOIO_SUCCESS The operation succeeded. dev->result contains
1137 * DOIO_HARD A hard or unexpected I/O error was encountered.
1138 * dev->result contains the appropriate error.
1140 * DOIO_SOFT A soft I/O error was encountered. No senddone
1141 * event was sent. The operation should be retried.
1143 * No other return values are possible.
/*
 * doio_send(): perform one sendmsg() for the pending event 'dev',
 * retrying up to NRETRIES times on EINTR, then classifying errno the
 * same soft/hard way as doio_recv().  A zero-byte send is reported as an
 * unexpected error; a short write updates counters for a retry.
 * NOTE(review): this excerpt omits lines ('cc'/'send_errno'/'attempts'
 * declarations, the resend label/goto, returns, braces) -- partial
 * listing only.
 */
1146 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1148 struct iovec iov[MAXSCATTERGATHER_SEND];
1150 struct msghdr msghdr;
1151 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1154 char strbuf[ISC_STRERRORSIZE];
1156 build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1159 cc = sendmsg(sock->fd, &msghdr, 0);
1163 * Check for error or block condition.
/* Interrupted system call: retry a bounded number of times. */
1166 if (send_errno == EINTR && ++attempts < NRETRIES)
1169 if (SOFT_ERROR(send_errno))
1172 #define SOFT_OR_HARD(_system, _isc) \
1173 if (send_errno == _system) { \
1174 if (sock->connected) { \
1175 dev->result = _isc; \
1176 return (DOIO_HARD); \
1178 return (DOIO_SOFT); \
1180 #define ALWAYS_HARD(_system, _isc) \
1181 if (send_errno == _system) { \
1182 dev->result = _isc; \
1183 return (DOIO_HARD); \
1186 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1187 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1188 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1189 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1190 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1192 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1194 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1195 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1196 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1197 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1198 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1204 * The other error types depend on whether or not the
1205 * socket is UDP or TCP. If it is UDP, some errors
1206 * that we expect to be fatal under TCP are merely
1207 * annoying, and are really soft errors.
1209 * However, these soft errors are still returned as
1212 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1213 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1214 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1216 dev->result = isc__errno2result(send_errno);
/* sendmsg() returning 0 should not happen; log it loudly. */
1221 UNEXPECTED_ERROR(__FILE__, __LINE__,
1222 "internal_send: send() %s 0",
1223 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1224 ISC_MSG_RETURNED, "returned"));
1227 * If we write less than we expected, update counters, poke.
1230 if ((size_t)cc != write_count)
1234 * Exactly what we wanted to write. We're done with this
1235 * entry. Post its completion event.
1237 dev->result = ISC_R_SUCCESS;
1238 return (DOIO_SUCCESS);
1244 * Caller must ensure that the socket is not locked and no external
/*
 * destroy(): kill the (fully idle) socket -- all event lists must be
 * empty.  Unhooks the fd from the manager tables, asks the watcher to
 * close it via SELECT_POKE_CLOSE, unlinks from the manager's socket
 * list, and (threaded builds) signals shutdown_ok when the list drains.
 * NOTE(review): excerpt omits lines (free_socket() call, closing brace).
 */
1248 destroy(isc_socket_t **sockp) {
1249 isc_socket_t *sock = *sockp;
1250 isc_socketmgr_t *manager = sock->manager;
1252 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1253 ISC_MSG_DESTROYING, "destroying");
1255 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1256 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1257 INSIST(ISC_LIST_EMPTY(sock->send_list));
1258 INSIST(sock->connect_ev == NULL);
1259 REQUIRE(sock->fd >= 0 && sock->fd < (int)manager->fdsize);
1261 LOCK(&manager->lock);
1264 * No one has this socket open, so the watcher doesn't have to be
1265 * poked, and the socket doesn't have to be locked.
1267 manager->fds[sock->fd] = NULL;
1268 manager->fdstate[sock->fd] = CLOSE_PENDING;
1269 select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1270 ISC_LIST_UNLINK(manager->socklist, sock, link);
1272 #ifdef ISC_PLATFORM_USETHREADS
1273 if (ISC_LIST_EMPTY(manager->socklist))
1274 SIGNAL(&manager->shutdown_ok);
1275 #endif /* ISC_PLATFORM_USETHREADS */
1278 * XXX should reset manager->maxfd here
1281 UNLOCK(&manager->lock);
/*
 * allocate_socket(): allocate and initialize a new isc_socket_t
 * (zero references, empty event lists, cmsg send/receive buffers sized
 * for IPV6_PKTINFO and SO_TIMESTAMP as configured, lock and internal
 * read/write events initialized).  On success *socketp is set and
 * ISC_R_SUCCESS returned; the error path frees any partial allocations.
 * NOTE(review): excerpt omits lines (NULL check bodies, goto-cleanup
 * labels, several field initializations, 'return (result)') -- partial
 * listing only.
 */
1287 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1288 isc_socket_t **socketp)
1291 isc_result_t result;
1292 ISC_SOCKADDR_LEN_T cmsgbuflen;
1294 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1297 return (ISC_R_NOMEMORY);
1299 result = ISC_R_UNEXPECTED;
1302 sock->references = 0;
1304 sock->manager = manager;
1308 ISC_LINK_INIT(sock, link);
1310 sock->recvcmsgbuf = NULL;
1311 sock->sendcmsgbuf = NULL;
1314 * set up cmsg buffers
/* Receive side: room for pktinfo plus (optionally) a timestamp. */
1317 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1318 cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1320 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1321 cmsgbuflen += cmsg_space(sizeof(struct timeval));
1323 sock->recvcmsgbuflen = cmsgbuflen;
1324 if (sock->recvcmsgbuflen != 0U) {
1325 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1326 if (sock->recvcmsgbuf == NULL)
/* Send side: pktinfo only. */
1331 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1332 cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1334 sock->sendcmsgbuflen = cmsgbuflen;
1335 if (sock->sendcmsgbuflen != 0U) {
1336 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1337 if (sock->sendcmsgbuf == NULL)
1342 * set up list of readers and writers to be initially empty
1344 ISC_LIST_INIT(sock->recv_list);
1345 ISC_LIST_INIT(sock->send_list);
1346 ISC_LIST_INIT(sock->accept_list);
1347 sock->connect_ev = NULL;
1348 sock->pending_recv = 0;
1349 sock->pending_send = 0;
1350 sock->pending_accept = 0;
1352 sock->connected = 0;
1353 sock->connecting = 0;
1357 * initialize the lock
1359 result = isc_mutex_init(&sock->lock);
1360 if (result != ISC_R_SUCCESS) {
1366 * Initialize readable and writable events
/* Statically embedded intev_t events, marked non-purgable. */
1368 ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1369 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1370 NULL, sock, sock, NULL, NULL);
1371 ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1372 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1373 NULL, sock, sock, NULL, NULL);
1375 sock->magic = SOCKET_MAGIC;
1378 return (ISC_R_SUCCESS);
/* Error path: release whatever was allocated before the failure. */
1381 if (sock->recvcmsgbuf != NULL)
1382 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1383 sock->recvcmsgbuflen);
1384 if (sock->sendcmsgbuf != NULL)
1385 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1386 sock->sendcmsgbuflen);
1387 isc_mem_put(manager->mctx, sock, sizeof(*sock));
1393 * This event requires that the various lists be empty, that the reference
1394 * count be zero, and that the magic number is valid. The other socket bits,
1395 * like the lock, must be initialized as well. The fd associated must be
1396 * marked as closed, by setting it to -1 on close, or this routine will
1397 * also close the socket.
/*
 * free_socket():
 * Final teardown of an isc_socket_t whose last reference is gone.  The
 * INSIST()s below enforce the contract stated above; violating any of them
 * indicates a reference-counting or cancellation bug elsewhere.
 */
1400 free_socket(isc_socket_t **socketp) {
1401 isc_socket_t *sock = *socketp;
1403 INSIST(sock->references == 0);
1404 INSIST(VALID_SOCKET(sock));
1405 INSIST(!sock->connecting);
1406 INSIST(!sock->pending_recv);
1407 INSIST(!sock->pending_send);
1408 INSIST(!sock->pending_accept);
1409 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1410 INSIST(ISC_LIST_EMPTY(sock->send_list));
1411 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1412 INSIST(!ISC_LINK_LINKED(sock, link));
/* Release cmsg buffers allocated in allocate_socket(), if any. */
1414 if (sock->recvcmsgbuf != NULL)
1415 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1416 sock->recvcmsgbuflen);
1417 if (sock->sendcmsgbuf != NULL)
1418 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1419 sock->sendcmsgbuflen);
1423 DESTROYLOCK(&sock->lock);
1425 isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1432 * This really should not be necessary to do. Having to work out
1433 * which kernel version we are on at run time so that we don't cause
1434 * the kernel to issue a warning about us using a deprecated socket option.
1435 * Such warnings should *never* be on by default in production kernels.
1437 * We can't do this at build time because executables are moved between
1438 * machines and hence kernels.
1440 * We can't just not set SO_BSDCOMPAT because some kernels require it.
/* One-shot guard so the uname() probe below runs at most once. */
1443 static isc_once_t bsdcompat_once = ISC_ONCE_INIT;
1444 isc_boolean_t bsdcompat = ISC_TRUE;
/*
 * clear_bsdcompat():
 * Linux-only (see the #endif below): parse the running kernel version from
 * uname() and clear the global bsdcompat flag on kernels >= 2.4, where
 * SO_BSDCOMPAT is deprecated and triggers a kernel warning.
 */
1447 clear_bsdcompat(void) {
1454 uname(&buf); /* Can only fail if buf is bad in Linux. */
1456 /* Paranoia in parsing can be increased, but we trust uname(). */
1457 major = strtol(buf.release, &endp, 10);
1459 minor = strtol(endp+1, &endp, 10);
1460 if ((major > 2) || ((major == 2) && (minor >= 4))) {
1461 bsdcompat = ISC_FALSE;
1464 #endif /* __linux__ */
1469 * Create a new 'type' socket managed by 'manager'. Events
1470 * will be posted to 'task' and when dispatched 'action' will be
1471 * called with 'arg' as the arg value. The new socket is returned
/*
 * isc_socket_create():
 * Allocate an isc_socket_t, create the underlying OS descriptor for the
 * requested type (UDP/TCP/UNIX), move it out of the low fd range, make it
 * non-blocking, apply platform socket options (SO_BSDCOMPAT, SO_TIMESTAMP,
 * IPv6 pktinfo, minimum MTU, SO_RCVBUF), and register the socket with the
 * manager under its lock.  Returns ISC_R_SUCCESS, ISC_R_NORESOURCES,
 * ISC_R_FAMILYNOSUPPORT, or ISC_R_UNEXPECTED.
 * NOTE(review): error labels and some cleanup lines are elided in this
 * excerpt; each early return is presumed preceded by elided cleanup.
 */
1475 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1476 isc_socket_t **socketp)
1478 isc_socket_t *sock = NULL;
1479 isc_result_t result;
1480 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1483 #if defined(SO_RCVBUF)
1484 ISC_SOCKADDR_LEN_T optlen;
1487 char strbuf[ISC_STRERRORSIZE];
1488 const char *err = "socket";
1491 REQUIRE(VALID_MANAGER(manager));
1492 REQUIRE(socketp != NULL && *socketp == NULL);
1494 result = allocate_socket(manager, type, &sock);
1495 if (result != ISC_R_SUCCESS)
/* Create the OS descriptor; protocol depends on the requested type. */
1501 case isc_sockettype_udp:
1502 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1504 case isc_sockettype_tcp:
1505 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1507 case isc_sockettype_unix:
1508 sock->fd = socket(pf, SOCK_STREAM, 0);
/* Retry socket() on EINTR a bounded number of times (42 attempts). */
1511 if (sock->fd == -1 && errno == EINTR && try++ < 42)
1516 * Leave a space for stdio and TCP to work in.
/*
 * Dup UDP sockets above manager->reserved (and anything above fd 20)
 * so low descriptors stay free for stdio and reserved uses.
 */
1518 if (manager->reserved != 0 && type == isc_sockettype_udp &&
1519 sock->fd >= 0 && sock->fd < manager->reserved) {
1521 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
1523 (void)close(sock->fd);
1526 err = "isc_socket_create: fcntl/reserved";
1527 } else if (sock->fd >= 0 && sock->fd < 20) {
1529 new = fcntl(sock->fd, F_DUPFD, 20);
1531 (void)close(sock->fd);
1534 err = "isc_socket_create: fcntl";
/* The manager's fd tables only cover [0, fdsize); refuse larger fds. */
1538 if (sock->fd >= (int)manager->fdsize) {
1539 (void)close(sock->fd);
1540 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1541 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1542 isc_msgcat, ISC_MSGSET_SOCKET,
1544 "%s: too many open file descriptors", "socket");
1546 return (ISC_R_NORESOURCES);
1556 return (ISC_R_NORESOURCES);
1558 case EPROTONOSUPPORT:
1562 * Linux 2.2 (and maybe others) return EINVAL instead of
1566 return (ISC_R_FAMILYNOSUPPORT);
1569 isc__strerror(errno, strbuf, sizeof(strbuf));
1570 UNEXPECTED_ERROR(__FILE__, __LINE__,
1572 isc_msgcat_get(isc_msgcat,
1577 return (ISC_R_UNEXPECTED);
1581 if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1582 (void)close(sock->fd);
1584 return (ISC_R_UNEXPECTED);
/* See clear_bsdcompat(): probe kernel version once, then decide. */
1588 RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
1589 clear_bsdcompat) == ISC_R_SUCCESS);
1590 if (type != isc_sockettype_unix && bsdcompat &&
1591 setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1592 (void *)&on, sizeof(on)) < 0) {
1593 isc__strerror(errno, strbuf, sizeof(strbuf));
1594 UNEXPECTED_ERROR(__FILE__, __LINE__,
1595 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1597 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1598 ISC_MSG_FAILED, "failed"),
/* UDP-only options: timestamps, IPv6 packet info, MTU, receive buffer. */
1604 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1605 if (type == isc_sockettype_udp) {
1607 #if defined(USE_CMSG)
1608 #if defined(SO_TIMESTAMP)
1609 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1610 (void *)&on, sizeof(on)) < 0
1611 && errno != ENOPROTOOPT) {
1612 isc__strerror(errno, strbuf, sizeof(strbuf));
1613 UNEXPECTED_ERROR(__FILE__, __LINE__,
1614 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1616 isc_msgcat_get(isc_msgcat,
1623 #endif /* SO_TIMESTAMP */
1625 #if defined(ISC_PLATFORM_HAVEIPV6)
1626 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1628 * Warn explicitly because this anomaly can be hidden
1629 * in usual operation (and unexpectedly appear later).
1631 UNEXPECTED_ERROR(__FILE__, __LINE__,
1632 "No buffer available to receive "
1633 "IPv6 destination");
1635 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1636 #ifdef IPV6_RECVPKTINFO
/* RFC 3542 spelling of the option (newer stacks). */
1638 if ((pf == AF_INET6)
1639 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1640 (void *)&on, sizeof(on)) < 0)) {
1641 isc__strerror(errno, strbuf, sizeof(strbuf));
1642 UNEXPECTED_ERROR(__FILE__, __LINE__,
1643 "setsockopt(%d, IPV6_RECVPKTINFO) "
1645 isc_msgcat_get(isc_msgcat,
/* RFC 2292 spelling of the option (older stacks). */
1653 if ((pf == AF_INET6)
1654 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1655 (void *)&on, sizeof(on)) < 0)) {
1656 isc__strerror(errno, strbuf, sizeof(strbuf));
1657 UNEXPECTED_ERROR(__FILE__, __LINE__,
1658 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1660 isc_msgcat_get(isc_msgcat,
1666 #endif /* IPV6_RECVPKTINFO */
1667 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1668 #ifdef IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/
1669 /* use minimum MTU */
1670 if (pf == AF_INET6) {
1671 (void)setsockopt(sock->fd, IPPROTO_IPV6,
1673 (void *)&on, sizeof(on));
1676 #endif /* ISC_PLATFORM_HAVEIPV6 */
1677 #endif /* defined(USE_CMSG) */
/* Grow the kernel receive buffer to at least RCVBUFSIZE (best effort). */
1679 #if defined(SO_RCVBUF)
1680 optlen = sizeof(size);
1681 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1682 (void *)&size, &optlen) >= 0 &&
1683 size < RCVBUFSIZE) {
1685 if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1686 (void *)&size, sizeof(size)) == -1) {
1687 isc__strerror(errno, strbuf, sizeof(strbuf));
1688 UNEXPECTED_ERROR(__FILE__, __LINE__,
1689 "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
1691 isc_msgcat_get(isc_msgcat,
1700 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1702 sock->references = 1;
1705 LOCK(&manager->lock);
1708 * Note we don't have to lock the socket like we normally would because
1709 * there are no external references to it yet.
1712 manager->fds[sock->fd] = sock;
1713 manager->fdstate[sock->fd] = MANAGED;
1714 ISC_LIST_APPEND(manager->socklist, sock, link);
1715 if (manager->maxfd < sock->fd)
1716 manager->maxfd = sock->fd;
1718 UNLOCK(&manager->lock);
1720 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1721 ISC_MSG_CREATED, "created");
1723 return (ISC_R_SUCCESS);
1727 * Attach to a socket. Caller must explicitly detach when it is done.
/*
 * isc_socket_attach():
 * Take an additional reference on 'sock' and store it in '*socketp'.
 * The reference-count increment itself is elided in this excerpt; it
 * happens under the socket lock released below.
 */
1730 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1731 REQUIRE(VALID_SOCKET(sock));
1732 REQUIRE(socketp != NULL && *socketp == NULL);
1736 UNLOCK(&sock->lock);
1742 * Dereference a socket. If this is the last reference to it, clean things
1743 * up by destroying the socket.
/*
 * isc_socket_detach():
 * Drop one reference under the socket lock; if the count hits zero the
 * socket is destroyed after the lock is released (destruction call is
 * elided in this excerpt).
 */
1746 isc_socket_detach(isc_socket_t **socketp) {
1748 isc_boolean_t kill_socket = ISC_FALSE;
1750 REQUIRE(socketp != NULL);
1752 REQUIRE(VALID_SOCKET(sock));
1755 REQUIRE(sock->references > 0);
1757 if (sock->references == 0)
1758 kill_socket = ISC_TRUE;
1759 UNLOCK(&sock->lock);
1768 * I/O is possible on a given socket. Schedule an event to this task that
1769 * will call an internal function to do the I/O. This will charge the
1770 * task with the I/O operation and let our select loop handler get back
1771 * to doing something real as fast as possible.
1773 * The socket and manager must be locked before calling this function.
/*
 * dispatch_recv():
 * Post the socket's internal readable event (internal_recv) to the task
 * owning the first queued receive request.  Sets pending_recv so the
 * watcher won't double-dispatch.  No-op path (empty recv_list) is elided.
 */
1776 dispatch_recv(isc_socket_t *sock) {
1778 isc_socketevent_t *ev;
1780 INSIST(!sock->pending_recv);
1782 ev = ISC_LIST_HEAD(sock->recv_list);
1786 sock->pending_recv = 1;
1787 iev = &sock->readable_ev;
1789 socket_log(sock, NULL, EVENT, NULL, 0, 0,
1790 "dispatch_recv: event %p -> task %p", ev, ev->ev_sender);
1793 iev->ev_sender = sock;
1794 iev->ev_action = internal_recv;
1797 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
/*
 * dispatch_send():
 * Mirror of dispatch_recv() for the write side: post the internal
 * writable event (internal_send) to the task owning the first queued
 * send request and mark pending_send.
 */
1801 dispatch_send(isc_socket_t *sock) {
1803 isc_socketevent_t *ev;
1805 INSIST(!sock->pending_send);
1807 ev = ISC_LIST_HEAD(sock->send_list);
1811 sock->pending_send = 1;
1812 iev = &sock->writable_ev;
1814 socket_log(sock, NULL, EVENT, NULL, 0, 0,
1815 "dispatch_send: event %p -> task %p", ev, ev->ev_sender);
1818 iev->ev_sender = sock;
1819 iev->ev_action = internal_send;
1822 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1826 * Dispatch an internal accept event.
/*
 * dispatch_accept():
 * Post the internal readable event (internal_accept) to the task owning
 * the first queued accept request.  Unlike dispatch_recv/send, this takes
 * an extra socket reference that internal_accept() releases.
 */
1829 dispatch_accept(isc_socket_t *sock) {
1831 isc_socket_newconnev_t *ev;
1833 INSIST(!sock->pending_accept);
1836 * Are there any done events left, or were they all canceled
1837 * before the manager got the socket lock?
1839 ev = ISC_LIST_HEAD(sock->accept_list);
1843 sock->pending_accept = 1;
1844 iev = &sock->readable_ev;
1846 sock->references++; /* keep socket around for this internal event */
1847 iev->ev_sender = sock;
1848 iev->ev_action = internal_accept;
1851 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
/*
 * dispatch_connect():
 * Post the internal writable event (internal_connect) to the task owning
 * the pending connect event.  Requires an outstanding connect (connect_ev
 * non-NULL, connecting set) and takes an extra socket reference that
 * internal_connect() releases.
 */
1855 dispatch_connect(isc_socket_t *sock) {
1857 isc_socket_connev_t *ev;
1859 iev = &sock->writable_ev;
1861 ev = sock->connect_ev;
1862 INSIST(ev != NULL); /* XXX */
1864 INSIST(sock->connecting);
1866 sock->references++; /* keep socket around for this internal event */
1867 iev->ev_sender = sock;
1868 iev->ev_action = internal_connect;
1871 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1875 * Dequeue an item off the given socket's read queue, set the result code
1876 * in the done event to the one provided, and send it to the task it was
1879 * If the event to be sent is on a list, remove it before sending. If
1880 * asked to, send and detach from the socket as well.
1882 * Caller must have the socket locked if the event is attached to the socket.
/*
 * send_recvdone_event():
 * Deliver a completed receive event to its task.  '*dev' is NULLed by the
 * task send.  ISC_SOCKEVENTATTR_ATTACHED means the task reference was
 * taken when the request was queued, so deliver-and-detach.
 */
1885 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1888 task = (*dev)->ev_sender;
1890 (*dev)->ev_sender = sock;
1892 if (ISC_LINK_LINKED(*dev, ev_link))
1893 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1895 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1896 == ISC_SOCKEVENTATTR_ATTACHED)
1897 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1899 isc_task_send(task, (isc_event_t **)dev);
1903 * See comments for send_recvdone_event() above.
1905 * Caller must have the socket locked if the event is attached to the socket.
/*
 * send_senddone_event():
 * Write-side twin of send_recvdone_event(); dequeues from send_list.
 */
1908 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1911 INSIST(dev != NULL && *dev != NULL);
1913 task = (*dev)->ev_sender;
1914 (*dev)->ev_sender = sock;
1916 if (ISC_LINK_LINKED(*dev, ev_link))
1917 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1919 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1920 == ISC_SOCKEVENTATTR_ATTACHED)
1921 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1923 isc_task_send(task, (isc_event_t **)dev);
1927 * Call accept() on a socket, to get the new file descriptor. The listen
1928 * socket is used as a prototype to create a new isc_socket_t. The new
1929 * socket has one outstanding reference. The task receiving the event
1930 * will be detached from just after the event is delivered.
1932 * On entry to this function, the event delivered is the internal
1933 * readable event, and the first item on the accept_list should be
1934 * the done event we want to send. If the list is empty, this is a no-op,
1935 * so just unlock and return.
/*
 * NOTE(review): many error-handling branches (errno switch cases, goto
 * targets such as soft_error) are elided in this excerpt.
 */
1938 internal_accept(isc_task_t *me, isc_event_t *ev) {
1940 isc_socketmgr_t *manager;
1941 isc_socket_newconnev_t *dev;
1943 ISC_SOCKADDR_LEN_T addrlen;
1945 isc_result_t result = ISC_R_SUCCESS;
1946 char strbuf[ISC_STRERRORSIZE];
1947 const char *err = "accept";
1951 sock = ev->ev_sender;
1952 INSIST(VALID_SOCKET(sock));
1955 socket_log(sock, NULL, TRACE,
1956 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1957 "internal_accept called, locked socket");
1959 manager = sock->manager;
1960 INSIST(VALID_MANAGER(manager));
1962 INSIST(sock->listener);
1963 INSIST(sock->pending_accept == 1);
1964 sock->pending_accept = 0;
1966 INSIST(sock->references > 0);
1967 sock->references--; /* the internal event is done with this socket */
1968 if (sock->references == 0) {
1969 UNLOCK(&sock->lock);
1975 * Get the first item off the accept list.
1976 * If it is empty, unlock the socket and return.
1978 dev = ISC_LIST_HEAD(sock->accept_list);
1980 UNLOCK(&sock->lock);
1985 * Try to accept the new connection. If the accept fails with
1986 * EAGAIN or EINTR, simply poke the watcher to watch this socket
1987 * again. Also ignore ECONNRESET, which has been reported to
1988 * be spuriously returned on Linux 2.2.19 although it is not
1989 * a documented error for accept(). ECONNABORTED has been
1990 * reported for Solaris 8. The rest are thrown in not because
1991 * we have seen them but because they are ignored by other
1992 * daemons such as BIND 8 and Apache.
1995 addrlen = sizeof(dev->newsocket->address.type);
1996 memset(&dev->newsocket->address.type.sa, 0, addrlen);
1997 fd = accept(sock->fd, &dev->newsocket->address.type.sa,
2002 * Leave a space for stdio to work in.
2004 if (fd >= 0 && fd < 20) {
2006 new = fcntl(fd, F_DUPFD, 20);
2011 err = "accept/fcntl";
2016 if (SOFT_ERROR(errno))
2021 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2022 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2023 isc_msgcat, ISC_MSGSET_SOCKET,
2025 "%s: too many open file descriptors",
2048 isc__strerror(errno, strbuf, sizeof(strbuf));
2049 UNEXPECTED_ERROR(__FILE__, __LINE__,
2050 "internal_accept: %s() %s: %s", err,
2051 isc_msgcat_get(isc_msgcat,
2057 result = ISC_R_UNEXPECTED;
/* Sanity-check the peer address the kernel returned. */
2059 if (addrlen == 0U) {
2060 UNEXPECTED_ERROR(__FILE__, __LINE__,
2061 "internal_accept(): "
2062 "accept() failed to return "
2067 } else if (dev->newsocket->address.type.sa.sa_family !=
2070 UNEXPECTED_ERROR(__FILE__, __LINE__,
2071 "internal_accept(): "
2072 "accept() returned peer address "
2073 "family %u (expected %u)",
2074 dev->newsocket->address.
2079 } else if (fd >= (int)manager->fdsize) {
2080 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2081 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2082 isc_msgcat, ISC_MSGSET_SOCKET,
2084 "%s: too many open file descriptors",
2092 dev->newsocket->address.length = addrlen;
2093 dev->newsocket->pf = sock->pf;
2097 * Pull off the done event.
2099 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2102 * Poke watcher if there are more pending accepts.
2104 if (!ISC_LIST_EMPTY(sock->accept_list))
2105 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2107 UNLOCK(&sock->lock);
2109 if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2112 result = ISC_R_UNEXPECTED;
2116 * -1 means the new socket didn't happen.
/* Register the accepted socket with the manager under its lock. */
2119 LOCK(&manager->lock);
2120 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2122 dev->newsocket->fd = fd;
2123 dev->newsocket->bound = 1;
2124 dev->newsocket->connected = 1;
2127 * Save away the remote address
2129 dev->address = dev->newsocket->address;
2131 manager->fds[fd] = dev->newsocket;
2132 manager->fdstate[fd] = MANAGED;
2133 if (manager->maxfd < fd)
2134 manager->maxfd = fd;
2136 socket_log(sock, &dev->newsocket->address, CREATION,
2137 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2138 "accepted connection, new socket %p",
2141 UNLOCK(&manager->lock);
/* Failure path: release the pre-allocated newsocket. */
2143 dev->newsocket->references--;
2144 free_socket(&dev->newsocket);
2148 * Fill in the done event details and send it off.
2150 dev->result = result;
2151 task = dev->ev_sender;
2152 dev->ev_sender = sock;
2154 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
/* Soft error: re-arm the watcher and try again later. */
2158 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2159 UNLOCK(&sock->lock);
/*
 * internal_recv():
 * Task action for the internal readable event.  Drains the socket's
 * recv_list, calling doio_recv() for each request until one would block;
 * a read of 0 (EOF) flushes the whole queue with ISC_R_EOF.  Re-pokes the
 * watcher if requests remain.  Some switch cases/labels are elided.
 */
2164 internal_recv(isc_task_t *me, isc_event_t *ev) {
2165 isc_socketevent_t *dev;
2168 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2170 sock = ev->ev_sender;
2171 INSIST(VALID_SOCKET(sock));
2174 socket_log(sock, NULL, IOEVENT,
2175 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2176 "internal_recv: task %p got event %p", me, ev);
2178 INSIST(sock->pending_recv == 1);
2179 sock->pending_recv = 0;
2181 INSIST(sock->references > 0);
2182 sock->references--; /* the internal event is done with this socket */
2183 if (sock->references == 0) {
2184 UNLOCK(&sock->lock);
2190 * Try to do as much I/O as possible on this socket. There are no
2191 * limits here, currently.
2193 dev = ISC_LIST_HEAD(sock->recv_list);
2194 while (dev != NULL) {
2195 switch (doio_recv(sock, dev)) {
2201 * read of 0 means the remote end was closed.
2202 * Run through the event queue and dispatch all
2203 * the events with an EOF result code.
2206 dev->result = ISC_R_EOF;
2207 send_recvdone_event(sock, &dev);
2208 dev = ISC_LIST_HEAD(sock->recv_list);
2209 } while (dev != NULL);
2214 send_recvdone_event(sock, &dev);
2218 dev = ISC_LIST_HEAD(sock->recv_list);
2222 if (!ISC_LIST_EMPTY(sock->recv_list))
2223 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2225 UNLOCK(&sock->lock);
/*
 * internal_send():
 * Task action for the internal writable event; write-side twin of
 * internal_recv().  Drains send_list via doio_send() and re-pokes the
 * watcher if requests remain.  Some switch cases are elided.
 */
2229 internal_send(isc_task_t *me, isc_event_t *ev) {
2230 isc_socketevent_t *dev;
2233 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2236 * Find out what socket this is and lock it.
2238 sock = (isc_socket_t *)ev->ev_sender;
2239 INSIST(VALID_SOCKET(sock));
2242 socket_log(sock, NULL, IOEVENT,
2243 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2244 "internal_send: task %p got event %p", me, ev);
2246 INSIST(sock->pending_send == 1);
2247 sock->pending_send = 0;
2249 INSIST(sock->references > 0);
2250 sock->references--; /* the internal event is done with this socket */
2251 if (sock->references == 0) {
2252 UNLOCK(&sock->lock);
2258 * Try to do as much I/O as possible on this socket. There are no
2259 * limits here, currently.
2261 dev = ISC_LIST_HEAD(sock->send_list);
2262 while (dev != NULL) {
2263 switch (doio_send(sock, dev)) {
2269 send_senddone_event(sock, &dev);
2273 dev = ISC_LIST_HEAD(sock->send_list);
2277 if (!ISC_LIST_EMPTY(sock->send_list))
2278 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2280 UNLOCK(&sock->lock);
/*
 * process_fds():
 * Walk fds [0, maxfd) after select() returns: retire CLOSE_PENDING fds,
 * skip the manager's wakeup pipe, and for each ready socket dispatch the
 * appropriate internal event (accept/recv on readable, connect/send on
 * writable), clearing the fd from the manager's interest sets so it is
 * not re-dispatched until the handler re-arms it.
 */
2284 process_fds(isc_socketmgr_t *manager, int maxfd,
2285 fd_set *readfds, fd_set *writefds)
2289 isc_boolean_t unlock_sock;
2291 REQUIRE(maxfd <= (int)manager->fdsize);
2294 * Process read/writes on other fds here. Avoid locking
2295 * and unlocking twice if both reads and writes are possible.
2297 for (i = 0; i < maxfd; i++) {
2298 #ifdef ISC_PLATFORM_USETHREADS
2299 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2301 #endif /* ISC_PLATFORM_USETHREADS */
2303 if (manager->fdstate[i] == CLOSE_PENDING) {
2304 manager->fdstate[i] = CLOSED;
2305 FD_CLR(i, manager->read_fds);
2306 FD_CLR(i, manager->write_fds);
2313 sock = manager->fds[i];
2314 unlock_sock = ISC_FALSE;
2315 if (FD_ISSET(i, readfds)) {
2317 FD_CLR(i, manager->read_fds);
2320 unlock_sock = ISC_TRUE;
2322 if (!SOCK_DEAD(sock)) {
2324 dispatch_accept(sock);
2326 dispatch_recv(sock);
2328 FD_CLR(i, manager->read_fds);
2331 if (FD_ISSET(i, writefds)) {
2333 FD_CLR(i, manager->write_fds);
2337 unlock_sock = ISC_TRUE;
2340 if (!SOCK_DEAD(sock)) {
2341 if (sock->connecting)
2342 dispatch_connect(sock);
2344 dispatch_send(sock);
2346 FD_CLR(i, manager->write_fds);
2349 UNLOCK(&sock->lock);
2353 #ifdef ISC_PLATFORM_USETHREADS
2355 * This is the thread that will loop forever, always in a select or poll
2358 * When select returns something to do, track down what thread gets to do
2359 * this I/O and post the event to it.
/*
 * watcher():
 * The manager's select() thread.  Each iteration: snapshot the interest
 * fd_sets under the manager lock, block in select(), then (a) drain
 * control messages from the internal pipe (wakeups, shutdown) and
 * (b) dispatch ready fds via process_fds().  Exits on
 * SELECT_POKE_SHUTDOWN.  Outer loop braces are elided in this excerpt.
 */
2361 static isc_threadresult_t
2362 watcher(void *uap) {
2363 isc_socketmgr_t *manager = uap;
2369 char strbuf[ISC_STRERRORSIZE];
2372 * Get the control fd here. This will never change.
2374 LOCK(&manager->lock);
2375 ctlfd = manager->pipe_fds[0];
/* Work on copies so the real interest sets can change under the lock. */
2380 memcpy(manager->read_fds_copy, manager->read_fds,
2381 manager->fd_bufsize);
2382 memcpy(manager->write_fds_copy, manager->write_fds,
2383 manager->fd_bufsize);
2384 maxfd = manager->maxfd + 1;
2386 UNLOCK(&manager->lock);
2388 cc = select(maxfd, manager->read_fds_copy,
2389 manager->write_fds_copy, NULL, NULL);
2391 if (!SOFT_ERROR(errno)) {
2392 isc__strerror(errno, strbuf,
2394 FATAL_ERROR(__FILE__, __LINE__,
2396 isc_msgcat_get(isc_msgcat,
2404 LOCK(&manager->lock);
2409 * Process reads on internal, control fd.
2411 if (FD_ISSET(ctlfd, manager->read_fds_copy)) {
2413 select_readmsg(manager, &fd, &msg);
2415 manager_log(manager, IOEVENT,
2416 isc_msgcat_get(isc_msgcat,
2419 "watcher got message %d"),
2425 if (msg == SELECT_POKE_NOTHING)
2429 * Handle shutdown message. We really should
2430 * jump out of this loop right away, but
2431 * it doesn't matter if we have to do a little
2434 if (msg == SELECT_POKE_SHUTDOWN) {
2441 * This is a wakeup on a socket. Look
2442 * at the event queue for both read and write,
2443 * and decide if we need to watch on it now
2446 wakeup_socket(manager, fd, msg);
2450 process_fds(manager, maxfd, manager->read_fds_copy,
2451 manager->write_fds_copy);
2454 manager_log(manager, TRACE,
2455 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2456 ISC_MSG_EXITING, "watcher exiting"));
2458 UNLOCK(&manager->lock);
2459 return ((isc_threadresult_t)0);
2461 #endif /* ISC_PLATFORM_USETHREADS */
/*
 * isc__socketmgr_setreserved():
 * Set the number of low file descriptors the manager keeps free;
 * isc_socket_create() dups new UDP sockets above this value.
 */
2464 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
2466 REQUIRE(VALID_MANAGER(manager));
2468 manager->reserved = reserved;
2472 * Initialize fdsets in socketmgr structure.
/*
 * create_fdsets():
 * Allocate the manager's per-fd tables (fds, fdstate) and the four fd_set
 * buffers (read/write plus the copies used by the watcher), sized either
 * by ISC_SOCKET_FDSETSIZE or the platform FD_SETSIZE.  On any allocation
 * failure, frees everything already allocated (via cleanup_fdsets, which
 * tolerates NULLs) and returns ISC_R_NOMEMORY.
 */
2475 create_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
2476 #if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
2477 manager->fdsize = ISC_SOCKET_FDSETSIZE;
2478 manager->fd_bufsize = howmany(ISC_SOCKET_FDSETSIZE, NFDBITS) *
2481 manager->fdsize = FD_SETSIZE;
2482 manager->fd_bufsize = sizeof(fd_set);
/* NULL everything first so the error path can free unconditionally. */
2485 manager->fds = NULL;
2486 manager->fdstate = NULL;
2487 manager->read_fds = NULL;
2488 manager->read_fds_copy = NULL;
2489 manager->write_fds = NULL;
2490 manager->write_fds_copy = NULL;
2492 manager->fds = isc_mem_get(mctx,
2493 manager->fdsize * sizeof(manager->fds[0]));
2494 if (manager->fds == NULL)
2497 manager->fdstate = isc_mem_get(mctx, manager->fdsize *
2498 sizeof(manager->fdstate[0]));
2499 if (manager->fdstate == NULL)
2502 manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
2503 if (manager->read_fds == NULL)
2505 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
2506 if (manager->read_fds_copy == NULL)
2508 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
2509 if (manager->write_fds == NULL)
2511 manager->write_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
2512 if (manager->write_fds_copy == NULL)
2515 return (ISC_R_SUCCESS);
2518 cleanup_fdsets(manager, mctx);
2519 return (ISC_R_NOMEMORY);
2523 * Clean up fdsets in socketmgr structure.
/*
 * cleanup_fdsets():
 * Free whichever of the manager's fd tables / fd_set buffers are non-NULL.
 * Safe to call on a partially initialized manager (see create_fdsets).
 */
2526 cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
2527 if (manager->fds != NULL) {
2528 isc_mem_put(mctx, manager->fds,
2529 manager->fdsize * sizeof(manager->fds[0]));
2531 if (manager->fdstate != NULL) {
2532 isc_mem_put(mctx, manager->fdstate,
2533 manager->fdsize * sizeof(manager->fdstate[0]));
2535 if (manager->read_fds != NULL)
2536 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
2537 if (manager->read_fds_copy != NULL)
2538 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
2539 if (manager->write_fds != NULL)
2540 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
2541 if (manager->write_fds_copy != NULL)
2542 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
2546 * Create a new socket manager.
/*
 * isc_socketmgr_create():
 * Build a socket manager: fd tables, lock, shutdown condition, the
 * internal wakeup pipe, initial (empty) select interest sets, and — in
 * the threaded build — the watcher thread.  In the non-threaded build a
 * single global manager is shared and reference counted.  Each failure
 * path below unwinds exactly what was set up before it.
 */
2549 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2550 isc_socketmgr_t *manager;
2551 #ifdef ISC_PLATFORM_USETHREADS
2552 char strbuf[ISC_STRERRORSIZE];
2554 isc_result_t result;
2556 REQUIRE(managerp != NULL && *managerp == NULL);
2558 #ifndef ISC_PLATFORM_USETHREADS
/* Non-threaded build: hand out the shared singleton manager. */
2559 if (socketmgr != NULL) {
2561 *managerp = socketmgr;
2562 return (ISC_R_SUCCESS);
2564 #endif /* ISC_PLATFORM_USETHREADS */
2566 manager = isc_mem_get(mctx, sizeof(*manager));
2567 if (manager == NULL)
2568 return (ISC_R_NOMEMORY);
2570 result = create_fdsets(manager, mctx);
2571 if (result != ISC_R_SUCCESS) {
2572 cleanup_fdsets(manager, mctx);
2573 isc_mem_put(mctx, manager, sizeof(*manager));
2577 manager->magic = SOCKET_MANAGER_MAGIC;
2578 manager->mctx = NULL;
2579 memset(manager->fds, 0, sizeof(manager->fds[0]) * manager->fdsize);
2580 ISC_LIST_INIT(manager->socklist);
2581 result = isc_mutex_init(&manager->lock);
2582 if (result != ISC_R_SUCCESS) {
2583 cleanup_fdsets(manager, mctx);
2584 isc_mem_put(mctx, manager, sizeof(*manager));
2587 #ifdef ISC_PLATFORM_USETHREADS
2588 if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2589 cleanup_fdsets(manager, mctx);
2590 DESTROYLOCK(&manager->lock);
2591 isc_mem_put(mctx, manager, sizeof(*manager));
2592 UNEXPECTED_ERROR(__FILE__, __LINE__,
2593 "isc_condition_init() %s",
2594 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2595 ISC_MSG_FAILED, "failed"));
2596 return (ISC_R_UNEXPECTED);
2600 * Create the special fds that will be used to wake up the
2601 * select/poll loop when something internal needs to be done.
2603 if (pipe(manager->pipe_fds) != 0) {
2604 cleanup_fdsets(manager, mctx);
2605 DESTROYLOCK(&manager->lock);
2606 isc_mem_put(mctx, manager, sizeof(*manager));
2607 isc__strerror(errno, strbuf, sizeof(strbuf));
2608 UNEXPECTED_ERROR(__FILE__, __LINE__,
2610 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2611 ISC_MSG_FAILED, "failed"),
2614 return (ISC_R_UNEXPECTED);
2617 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2619 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2621 #else /* ISC_PLATFORM_USETHREADS */
2623 #endif /* ISC_PLATFORM_USETHREADS */
2626 * Set up initial state for the select loop
2628 memset(manager->read_fds, 0, manager->fd_bufsize);
2629 memset(manager->write_fds, 0, manager->fd_bufsize);
2630 #ifdef ISC_PLATFORM_USETHREADS
/* The watcher must always see the read end of the wakeup pipe. */
2631 FD_SET(manager->pipe_fds[0], manager->read_fds);
2632 manager->maxfd = manager->pipe_fds[0];
2633 #else /* ISC_PLATFORM_USETHREADS */
2635 #endif /* ISC_PLATFORM_USETHREADS */
2636 manager->reserved = 0;
2637 memset(manager->fdstate, 0,
2638 manager->fdsize * sizeof(manager->fdstate[0]));
2640 #ifdef ISC_PLATFORM_USETHREADS
2642 * Start up the select/poll thread.
2644 if (isc_thread_create(watcher, manager, &manager->watcher) !=
2646 (void)close(manager->pipe_fds[0]);
2647 (void)close(manager->pipe_fds[1]);
2648 DESTROYLOCK(&manager->lock);
2649 isc_mem_put(mctx, manager, sizeof(*manager));
2650 UNEXPECTED_ERROR(__FILE__, __LINE__,
2651 "isc_thread_create() %s",
2652 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2653 ISC_MSG_FAILED, "failed"));
2654 return (ISC_R_UNEXPECTED);
2656 #endif /* ISC_PLATFORM_USETHREADS */
2657 isc_mem_attach(mctx, &manager->mctx);
2659 #ifndef ISC_PLATFORM_USETHREADS
2660 socketmgr = manager;
2661 #endif /* ISC_PLATFORM_USETHREADS */
2662 *managerp = manager;
2664 return (ISC_R_SUCCESS);
/*
 * isc_socketmgr_destroy():
 * Tear down a socket manager.  Threaded build: wait (on shutdown_ok) for
 * all sockets to be destroyed, poke the watcher with
 * SELECT_POKE_SHUTDOWN, join it, then close the pipe and free all state.
 * Non-threaded build: just drop a reference on the shared manager until
 * the last one, warning if sockets remain.
 */
2668 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2669 isc_socketmgr_t *manager;
2674 * Destroy a socket manager.
2677 REQUIRE(managerp != NULL);
2678 manager = *managerp;
2679 REQUIRE(VALID_MANAGER(manager));
2681 #ifndef ISC_PLATFORM_USETHREADS
2682 if (manager->refs > 1) {
2687 #endif /* ISC_PLATFORM_USETHREADS */
2689 LOCK(&manager->lock);
2691 #ifdef ISC_PLATFORM_USETHREADS
2693 * Wait for all sockets to be destroyed.
2695 while (!ISC_LIST_EMPTY(manager->socklist)) {
2696 manager_log(manager, CREATION,
2697 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2698 ISC_MSG_SOCKETSREMAIN,
2700 WAIT(&manager->shutdown_ok, &manager->lock);
2702 #else /* ISC_PLATFORM_USETHREADS */
2704 * Hope all sockets have been destroyed.
2706 if (!ISC_LIST_EMPTY(manager->socklist)) {
2707 manager_log(manager, CREATION,
2708 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2709 ISC_MSG_SOCKETSREMAIN,
2713 #endif /* ISC_PLATFORM_USETHREADS */
2715 UNLOCK(&manager->lock);
2718 * Here, poke our select/poll thread. Do this by closing the write
2719 * half of the pipe, which will send EOF to the read half.
2720 * This is currently a no-op in the non-threaded case.
2722 select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2724 #ifdef ISC_PLATFORM_USETHREADS
2726 * Wait for thread to exit.
2728 if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2729 UNEXPECTED_ERROR(__FILE__, __LINE__,
2730 "isc_thread_join() %s",
2731 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2732 ISC_MSG_FAILED, "failed"));
2733 #endif /* ISC_PLATFORM_USETHREADS */
2738 #ifdef ISC_PLATFORM_USETHREADS
2739 (void)close(manager->pipe_fds[0]);
2740 (void)close(manager->pipe_fds[1]);
2741 (void)isc_condition_destroy(&manager->shutdown_ok);
2742 #endif /* ISC_PLATFORM_USETHREADS */
/* Close any fds left in CLOSE_PENDING state (close call is elided). */
2744 for (i = 0; i < (int)manager->fdsize; i++)
2745 if (manager->fdstate[i] == CLOSE_PENDING)
2748 DESTROYLOCK(&manager->lock);
2749 cleanup_fdsets(manager, manager->mctx);
/* Save mctx first: the manager (holding the reference) is freed next. */
2751 mctx= manager->mctx;
2752 isc_mem_put(mctx, manager, sizeof(*manager));
2754 isc_mem_detach(&mctx);
/*
 * socket_recv():
 * Common receive path used by the public isc_socket_recv* entry points.
 * UDP: attempt the read immediately (doio_recv).  TCP: only read now if
 * no earlier requests are queued, to preserve ordering.  On DOIO_SOFT
 * (would block) the request is queued, the task attached, and the
 * watcher poked; otherwise the done event is sent (unless the caller
 * asked for IMMEDIATE delivery).  Some branch labels are elided.
 */
2760 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2764 isc_boolean_t have_lock = ISC_FALSE;
2765 isc_task_t *ntask = NULL;
2766 isc_result_t result = ISC_R_SUCCESS;
2768 dev->ev_sender = task;
2770 if (sock->type == isc_sockettype_udp) {
2771 io_state = doio_recv(sock, dev);
2774 have_lock = ISC_TRUE;
/* TCP: don't read out of order ahead of already-queued requests. */
2776 if (ISC_LIST_EMPTY(sock->recv_list))
2777 io_state = doio_recv(sock, dev);
2779 io_state = DOIO_SOFT;
2785 * We couldn't read all or part of the request right now, so
2788 * Attach to socket and to task
2790 isc_task_attach(task, &ntask);
2791 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2795 have_lock = ISC_TRUE;
2799 * Enqueue the request. If the socket was previously not being
2800 * watched, poke the watcher to start paying attention to it.
2802 if (ISC_LIST_EMPTY(sock->recv_list))
2803 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2804 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2806 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2807 "socket_recv: event %p -> task %p",
2810 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2811 result = ISC_R_INPROGRESS;
2815 dev->result = ISC_R_EOF;
2820 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2821 send_recvdone_event(sock, &dev);
2826 UNLOCK(&sock->lock);
/*
 * isc_socket_recvv():
 * Scatter receive: move the caller's buffer list into a freshly allocated
 * RECVDONE event and hand it to socket_recv().  'minimum' of 0 means
 * "fill the buffers" (minimum = available count); UDP always completes on
 * the first datagram regardless.
 */
2832 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2833 unsigned int minimum, isc_task_t *task,
2834 isc_taskaction_t action, const void *arg)
2836 isc_socketevent_t *dev;
2837 isc_socketmgr_t *manager;
2838 unsigned int iocount;
2839 isc_buffer_t *buffer;
2841 REQUIRE(VALID_SOCKET(sock));
2842 REQUIRE(buflist != NULL);
2843 REQUIRE(!ISC_LIST_EMPTY(*buflist));
2844 REQUIRE(task != NULL);
2845 REQUIRE(action != NULL);
2847 manager = sock->manager;
2848 REQUIRE(VALID_MANAGER(manager));
2850 iocount = isc_bufferlist_availablecount(buflist);
2851 REQUIRE(iocount > 0);
2853 INSIST(sock->bound);
2855 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2857 return (ISC_R_NOMEMORY);
2861 * UDP sockets are always partial read
2863 if (sock->type == isc_sockettype_udp)
2867 dev->minimum = iocount;
2869 dev->minimum = minimum;
2873 * Move each buffer from the passed in list to our internal one.
2875 buffer = ISC_LIST_HEAD(*buflist);
2876 while (buffer != NULL) {
2877 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2878 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2879 buffer = ISC_LIST_HEAD(*buflist);
2882 return (socket_recv(sock, dev, task, 0));
2886 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2887 		isc_task_t *task, isc_taskaction_t action, const void *arg)
/*
 * Single-region read.  Allocates a RECVDONE event for (action, arg) and
 * delegates the actual work to isc_socket_recv2() with flags == 0.
 */
2889 	isc_socketevent_t *dev;
2890 	isc_socketmgr_t *manager;
2892 	REQUIRE(VALID_SOCKET(sock));
2893 	REQUIRE(action != NULL);
2895 	manager = sock->manager;
2896 	REQUIRE(VALID_MANAGER(manager));
2898 	INSIST(sock->bound);
2900 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2902 		return (ISC_R_NOMEMORY);
2904 	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2908 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2909 		 unsigned int minimum, isc_task_t *task,
2910 		 isc_socketevent_t *event, unsigned int flags)
/*
 * Read into a single region using a caller-supplied event (allows event
 * reuse).  Resets the event's result/region fields, picks the minimum
 * read size, then calls socket_recv().
 */
2912 	event->ev_sender = sock;
2913 	event->result = ISC_R_UNEXPECTED;
2914 	ISC_LIST_INIT(event->bufferlist);
2915 	event->region = *region;
2918 	event->attributes = 0;
2921 	 * UDP sockets are always partial read.
2923 	if (sock->type == isc_sockettype_udp)
2927 		event->minimum = region->length;
2929 		event->minimum = minimum;
2932 	return (socket_recv(sock, event, task, flags));
2936 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2937 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
/*
 * Common send helper: record destination/pktinfo on the event, attempt
 * immediate I/O for UDP (or for TCP with an idle send queue), otherwise
 * queue the event (unless ISC_SOCKFLAG_NORETRY) and poke the watcher.
 * NOTE(review): this listing is missing intermediate source lines.
 */
2941 	isc_boolean_t have_lock = ISC_FALSE;
2942 	isc_task_t *ntask = NULL;
2943 	isc_result_t result = ISC_R_SUCCESS;
2945 	dev->ev_sender = task;
2947 	set_dev_address(address, sock, dev);
2948 	if (pktinfo != NULL) {
2949 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2950 		dev->pktinfo = *pktinfo;
2952 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2953 		    !isc_sockaddr_islinklocal(&dev->address)) {
2954 			socket_log(sock, NULL, TRACE, isc_msgcat,
2955 				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2956 				   "pktinfo structure provided, ifindex %u "
2957 				   "(set to 0)", pktinfo->ipi6_ifindex);
2960 			 * Set the pktinfo index to 0 here, to let the
2961 			 * kernel decide what interface it should send on.
2963 			dev->pktinfo.ipi6_ifindex = 0;
2967 	if (sock->type == isc_sockettype_udp)
2968 		io_state = doio_send(sock, dev);
2971 		have_lock = ISC_TRUE;
2973 		if (ISC_LIST_EMPTY(sock->send_list))
2974 			io_state = doio_send(sock, dev);
2976 			io_state = DOIO_SOFT;
2982 		 * We couldn't send all or part of the request right now, so
2983 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2985 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2986 			isc_task_attach(task, &ntask);
2987 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2991 			have_lock = ISC_TRUE;
2995 			 * Enqueue the request. If the socket was previously
2996 			 * not being watched, poke the watcher to start
2997 			 * paying attention to it.
2999 			if (ISC_LIST_EMPTY(sock->send_list))
3000 				select_poke(sock->manager, sock->fd,
3002 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3004 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
3005 				   "socket_send: event %p -> task %p",
3008 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3009 			result = ISC_R_INPROGRESS;
3015 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
3016 			send_senddone_event(sock, &dev);
3021 		UNLOCK(&sock->lock);
3027 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
3028 		isc_task_t *task, isc_taskaction_t action, const void *arg)
/* Convenience wrapper: send with no explicit destination or pktinfo. */
3031 	 * REQUIRE() checking is performed in isc_socket_sendto().
3033 	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3038 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
3039 		  isc_task_t *task, isc_taskaction_t action, const void *arg,
3040 		  isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
/*
 * Send a single region to an (optional) explicit address.  Allocates a
 * SENDDONE event and delegates to socket_send() with flags == 0.
 */
3042 	isc_socketevent_t *dev;
3043 	isc_socketmgr_t *manager;
3045 	REQUIRE(VALID_SOCKET(sock));
3046 	REQUIRE(region != NULL);
3047 	REQUIRE(task != NULL);
3048 	REQUIRE(action != NULL);
3050 	manager = sock->manager;
3051 	REQUIRE(VALID_MANAGER(manager));
3053 	INSIST(sock->bound);
3055 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3057 		return (ISC_R_NOMEMORY);
3060 	dev->region = *region;
3062 	return (socket_send(sock, dev, task, address, pktinfo, 0));
3066 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3067 		 isc_task_t *task, isc_taskaction_t action, const void *arg)
/* Convenience wrapper: gather send with no explicit destination/pktinfo. */
3069 	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3074 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3075 		   isc_task_t *task, isc_taskaction_t action, const void *arg,
3076 		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
/*
 * Gather send from a buffer list.  Takes ownership of every buffer in
 * *buflist (drained into dev->bufferlist) and delegates to socket_send().
 * Note this counts *used* bytes, unlike recvv which counts available.
 */
3078 	isc_socketevent_t *dev;
3079 	isc_socketmgr_t *manager;
3080 	unsigned int iocount;
3081 	isc_buffer_t *buffer;
3083 	REQUIRE(VALID_SOCKET(sock));
3084 	REQUIRE(buflist != NULL);
3085 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
3086 	REQUIRE(task != NULL);
3087 	REQUIRE(action != NULL);
3089 	manager = sock->manager;
3090 	REQUIRE(VALID_MANAGER(manager));
3092 	iocount = isc_bufferlist_usedcount(buflist);
3093 	REQUIRE(iocount > 0);
3095 	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3097 		return (ISC_R_NOMEMORY);
3101 	 * Move each buffer from the passed in list to our internal one.
3103 	buffer = ISC_LIST_HEAD(*buflist);
3104 	while (buffer != NULL) {
3105 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
3106 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3107 		buffer = ISC_LIST_HEAD(*buflist);
3110 	return (socket_send(sock, dev, task, address, pktinfo, 0));
3114 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3116 		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3117 		   isc_socketevent_t *event, unsigned int flags)
/*
 * Send with a caller-supplied event (allows event reuse) and explicit
 * flags.  ISC_SOCKFLAG_NORETRY is only permitted on UDP sockets.
 */
3119 	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3120 	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3121 		REQUIRE(sock->type == isc_sockettype_udp);
3122 	event->ev_sender = sock;
3123 	event->result = ISC_R_UNEXPECTED;
3124 	ISC_LIST_INIT(event->bufferlist);
3125 	event->region = *region;
3128 	event->attributes = 0;
3130 	return (socket_send(sock, event, task, address, pktinfo, flags));
3134 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
/*
 * Clean up a stale AF_UNIX socket path.  In the "active" case the path
 * is unlinked unconditionally (after sanity-checking it is a socket or
 * FIFO); otherwise a probe connect() is made and the path is unlinked
 * only if nothing is listening.  No-op unless built with
 * ISC_PLATFORM_HAVESYSUNH or if the address is not AF_UNIX.
 * NOTE(review): intermediate lines (including the #else/#endif
 * structure and the connect-success branch) are missing here.
 */
3135 #ifdef ISC_PLATFORM_HAVESYSUNH
3138 	char strbuf[ISC_STRERRORSIZE];
3140 	if (sockaddr->type.sa.sa_family != AF_UNIX)
3144 #if defined(S_IFMT) && defined(S_IFSOCK)
3145 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
3146 #elif defined(_S_IFMT) && defined(S_IFSOCK)
3147 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
3152 #if defined(S_IFMT) && defined(S_IFIFO)
3153 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
3154 #elif defined(_S_IFMT) && defined(S_IFIFO)
3155 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
3159 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
3160 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
3164 #define S_ISFIFO(mode) 0
3168 #define S_ISSOCK(mode) 0
/* Active shutdown path: stat, verify type, unlink. */
3172 		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3173 			isc__strerror(errno, strbuf, sizeof(strbuf));
3174 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3175 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3176 				      "isc_socket_cleanunix: stat(%s): %s",
3177 				      sockaddr->type.sunix.sun_path, strbuf);
3180 		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3181 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3182 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3183 				      "isc_socket_cleanunix: %s: not a socket",
3184 				      sockaddr->type.sunix.sun_path);
3187 		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3188 			isc__strerror(errno, strbuf, sizeof(strbuf));
3189 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3190 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3191 				      "isc_socket_cleanunix: unlink(%s): %s",
3192 				      sockaddr->type.sunix.sun_path, strbuf);
/* Passive path: probe with a temporary socket before unlinking. */
3197 	s = socket(AF_UNIX, SOCK_STREAM, 0);
3199 		isc__strerror(errno, strbuf, sizeof(strbuf));
3200 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3201 			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3202 			      "isc_socket_cleanunix: socket(%s): %s",
3203 			      sockaddr->type.sunix.sun_path, strbuf);
3207 	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3209 		case ENOENT:	/* We exited cleanly last time */
3212 			isc__strerror(errno, strbuf, sizeof(strbuf));
3213 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3214 				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3215 				      "isc_socket_cleanunix: stat(%s): %s",
3216 				      sockaddr->type.sunix.sun_path, strbuf);
3222 	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3223 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3224 			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3225 			      "isc_socket_cleanunix: %s: not a socket",
3226 			      sockaddr->type.sunix.sun_path);
/* connect() failure (e.g. ECONNREFUSED) means no live listener:
 * safe to remove the stale path. */
3230 	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
3231 		    sizeof(sockaddr->type.sunix)) < 0) {
3235 			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3236 				isc__strerror(errno, strbuf, sizeof(strbuf));
3237 				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3238 					      ISC_LOGMODULE_SOCKET,
3240 					      "isc_socket_cleanunix: "
3242 					      sockaddr->type.sunix.sun_path,
3247 			isc__strerror(errno, strbuf, sizeof(strbuf));
3248 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3249 				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3250 				      "isc_socket_cleanunix: connect(%s): %s",
3251 				      sockaddr->type.sunix.sun_path, strbuf);
3264 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
3265 		    isc_uint32_t owner, isc_uint32_t group)
/*
 * Apply mode/owner/group to an AF_UNIX socket path (or, on platforms
 * defining NEED_SECURE_DIRECTORY, to its containing directory).
 * Returns ISC_R_SUCCESS, ISC_R_FAILURE on chmod/chown error, or
 * ISC_R_NOTIMPLEMENTED when built without ISC_PLATFORM_HAVESYSUNH.
 */
3267 #ifdef ISC_PLATFORM_HAVESYSUNH
3268 	isc_result_t result = ISC_R_SUCCESS;
3269 	char strbuf[ISC_STRERRORSIZE];
3270 	char path[sizeof(sockaddr->type.sunix.sun_path)];
3271 #ifdef NEED_SECURE_DIRECTORY
3275 	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
/* Length is INSISTed first, so the fixed-size copy below is safe. */
3276 	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
3277 	strcpy(path, sockaddr->type.sunix.sun_path);
3279 #ifdef NEED_SECURE_DIRECTORY
3280 	slash = strrchr(path, '/');
3281 	if (slash != NULL) {
3290 	if (chmod(path, perm) < 0) {
3291 		isc__strerror(errno, strbuf, sizeof(strbuf));
3292 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3293 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3294 			      "isc_socket_permunix: chmod(%s, %d): %s",
3295 			      path, perm, strbuf);
3296 		result = ISC_R_FAILURE;
3298 	if (chown(path, owner, group) < 0) {
3299 		isc__strerror(errno, strbuf, sizeof(strbuf));
3300 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3301 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3302 			      "isc_socket_permunix: chown(%s, %d, %d): %s",
3305 		result = ISC_R_FAILURE;
3313 	return (ISC_R_NOTIMPLEMENTED);
3318 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3319 unsigned int options) {
3320 char strbuf[ISC_STRERRORSIZE];
3325 INSIST(!sock->bound);
3327 if (sock->pf != sockaddr->type.sa.sa_family) {
3328 UNLOCK(&sock->lock);
3329 return (ISC_R_FAMILYMISMATCH);
3332 * Only set SO_REUSEADDR when we want a specific port.
3335 if (sock->pf == AF_UNIX)
3338 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3339 isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3340 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3342 UNEXPECTED_ERROR(__FILE__, __LINE__,
3343 "setsockopt(%d) %s", sock->fd,
3344 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3345 ISC_MSG_FAILED, "failed"));
3351 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3352 UNLOCK(&sock->lock);
3355 return (ISC_R_NOPERM);
3357 return (ISC_R_ADDRNOTAVAIL);
3359 return (ISC_R_ADDRINUSE);
3361 return (ISC_R_BOUND);
3363 isc__strerror(errno, strbuf, sizeof(strbuf));
3364 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3366 return (ISC_R_UNEXPECTED);
3370 socket_log(sock, sockaddr, TRACE,
3371 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3374 UNLOCK(&sock->lock);
3375 return (ISC_R_SUCCESS);
3379 isc_socket_filter(isc_socket_t *sock, const char *filter) {
/*
 * Install a BSD accept filter (SO_ACCEPTFILTER) on a listening socket.
 * Returns ISC_R_NOTIMPLEMENTED where the option does not exist.
 * NOTE(review): strncpy into afa.af_name may leave the name
 * unterminated if `filter` fills the field exactly; the kernel API
 * tolerates a full-width name, so this matches the platform contract.
 */
3380 #ifdef SO_ACCEPTFILTER
3381 	char strbuf[ISC_STRERRORSIZE];
3382 	struct accept_filter_arg afa;
3388 	REQUIRE(VALID_SOCKET(sock));
3390 #ifdef SO_ACCEPTFILTER
3391 	bzero(&afa, sizeof(afa));
3392 	strncpy(afa.af_name, filter, sizeof(afa.af_name));
3393 	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
3394 		       &afa, sizeof(afa)) == -1) {
3395 		isc__strerror(errno, strbuf, sizeof(strbuf));
3396 		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
3397 			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
3399 		return (ISC_R_FAILURE);
3401 	return (ISC_R_SUCCESS);
3403 	return (ISC_R_NOTIMPLEMENTED);
3408 * Set up to listen on a given socket. We do this by creating an internal
3409 * event that will be dispatched when the socket has read activity. The
3410 * watcher will send the internal event to the task when there is a new
3413 * Unlike in read, we don't preallocate a done event here. Every time there
3414 * is a new connection we'll have to allocate a new one anyway, so we might
3415 * as well keep things simple rather than having to track them.
3418 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
/*
 * Put a bound TCP or UNIX-domain socket into listening state.
 * A backlog of 0 is replaced with SOMAXCONN.
 */
3419 	char strbuf[ISC_STRERRORSIZE];
3421 	REQUIRE(VALID_SOCKET(sock));
3425 	REQUIRE(!sock->listener);
3426 	REQUIRE(sock->bound);
3427 	REQUIRE(sock->type == isc_sockettype_tcp ||
3428 		sock->type == isc_sockettype_unix);
3431 		backlog = SOMAXCONN;
3433 	if (listen(sock->fd, (int)backlog) < 0) {
3434 		UNLOCK(&sock->lock);
3435 		isc__strerror(errno, strbuf, sizeof(strbuf));
3437 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3439 		return (ISC_R_UNEXPECTED);
3444 	UNLOCK(&sock->lock);
3445 	return (ISC_R_SUCCESS);
3449  * This should try to do aggressive accept() XXXMLG
3452 isc_socket_accept(isc_socket_t *sock,
3453 		  isc_task_t *task, isc_taskaction_t action, const void *arg)
/*
 * Queue an asynchronous accept on a listening socket.  Allocates a
 * NEWCONN event plus the socket object that will represent the accepted
 * connection, enqueues the request, and pokes the watcher if this is the
 * first pending accept.
 */
3455 	isc_socket_newconnev_t *dev;
3456 	isc_socketmgr_t *manager;
3457 	isc_task_t *ntask = NULL;
3458 	isc_socket_t *nsock;
3459 	isc_result_t result;
3460 	isc_boolean_t do_poke = ISC_FALSE;
3462 	REQUIRE(VALID_SOCKET(sock));
3463 	manager = sock->manager;
3464 	REQUIRE(VALID_MANAGER(manager));
3468 	REQUIRE(sock->listener);
3471 	 * Sender field is overloaded here with the task we will be sending
3472 	 * this event to.  Just before the actual event is delivered the
3473 	 * actual ev_sender will be touched up to be the socket.
3475 	dev = (isc_socket_newconnev_t *)
3476 		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3477 				   action, arg, sizeof(*dev));
3479 		UNLOCK(&sock->lock);
3480 		return (ISC_R_NOMEMORY);
3482 	ISC_LINK_INIT(dev, ev_link);
/* Pre-allocate the child socket so internal_accept need not allocate. */
3484 	result = allocate_socket(manager, sock->type, &nsock);
3485 	if (result != ISC_R_SUCCESS) {
3486 		isc_event_free(ISC_EVENT_PTR(&dev));
3487 		UNLOCK(&sock->lock);
3492 	 * Attach to socket and to task.
3494 	isc_task_attach(task, &ntask);
3495 	nsock->references++;
3497 	dev->ev_sender = ntask;
3498 	dev->newsocket = nsock;
3501 	 * Poke watcher here.  We still have the socket locked, so there
3502 	 * is no race condition.  We will keep the lock for such a short
3503 	 * bit of time waking it up now or later won't matter all that much.
3505 	if (ISC_LIST_EMPTY(sock->accept_list))
3508 	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3511 		select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3513 	UNLOCK(&sock->lock);
3514 	return (ISC_R_SUCCESS);
3518 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3519 		   isc_task_t *task, isc_taskaction_t action, const void *arg)
/*
 * Start a connect.  Tries connect(2) immediately: on EINPROGRESS (or a
 * soft error) the request is parked in sock->connect_ev and the watcher
 * is poked; on immediate success or a recognized hard error the CONNECT
 * event is posted to the task right away.
 * NOTE(review): intermediate lines are missing from this listing.
 */
3521 	isc_socket_connev_t *dev;
3522 	isc_task_t *ntask = NULL;
3523 	isc_socketmgr_t *manager;
3525 	char strbuf[ISC_STRERRORSIZE];
3527 	REQUIRE(VALID_SOCKET(sock));
3528 	REQUIRE(addr != NULL);
3529 	REQUIRE(task != NULL);
3530 	REQUIRE(action != NULL);
3532 	manager = sock->manager;
3533 	REQUIRE(VALID_MANAGER(manager));
3534 	REQUIRE(addr != NULL);
3536 	if (isc_sockaddr_ismulticast(addr))
3537 		return (ISC_R_MULTICAST);
3541 	REQUIRE(!sock->connecting);
3543 	dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3544 							ISC_SOCKEVENT_CONNECT,
3548 		UNLOCK(&sock->lock);
3549 		return (ISC_R_NOMEMORY);
3551 	ISC_LINK_INIT(dev, ev_link);
3554 	 * Try to do the connect right away, as there can be only one
3555 	 * outstanding, and it might happen to complete.
3557 	sock->address = *addr;
3558 	cc = connect(sock->fd, &addr->type.sa, addr->length);
3560 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
/* Translate hard connect() failures to ISC_R_* results and deliver
 * the event immediately via err_exit. */
3564 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3565 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
3566 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3567 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3568 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3569 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3571 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3573 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3574 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3575 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3576 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3577 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3581 		sock->connected = 0;
3583 		isc__strerror(errno, strbuf, sizeof(strbuf));
3584 		UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3586 		UNLOCK(&sock->lock);
3587 		isc_event_free(ISC_EVENT_PTR(&dev));
3588 		return (ISC_R_UNEXPECTED);
3591 		sock->connected = 0;
3592 		isc_task_send(task, ISC_EVENT_PTR(&dev));
3594 		UNLOCK(&sock->lock);
3595 		return (ISC_R_SUCCESS);
3599 	 * If connect completed, fire off the done event.
3602 		sock->connected = 1;
3604 		dev->result = ISC_R_SUCCESS;
3605 		isc_task_send(task, ISC_EVENT_PTR(&dev));
3607 		UNLOCK(&sock->lock);
3608 		return (ISC_R_SUCCESS);
/* Asynchronous path: remember the event and wait for the watcher. */
3616 	isc_task_attach(task, &ntask);
3618 	sock->connecting = 1;
3620 	dev->ev_sender = ntask;
3623 	 * Poke watcher here.  We still have the socket locked, so there
3624 	 * is no race condition.  We will keep the lock for such a short
3625 	 * bit of time waking it up now or later won't matter all that much.
3627 	if (sock->connect_ev == NULL)
3628 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3630 	sock->connect_ev = dev;
3632 	UNLOCK(&sock->lock);
3633 	return (ISC_R_SUCCESS);
3637 * Called when a socket with a pending connect() finishes.
3640 internal_connect(isc_task_t *me, isc_event_t *ev) {
/*
 * Watcher-side completion handler for a pending connect().  Reads the
 * final status via getsockopt(SO_ERROR), re-arms the select loop on
 * soft errors, maps hard errors to ISC_R_* codes, then delivers the
 * stored CONNECT event to its task.
 * NOTE(review): intermediate lines are missing from this listing.
 */
3642 	isc_socket_connev_t *dev;
3645 	ISC_SOCKADDR_LEN_T optlen;
3646 	char strbuf[ISC_STRERRORSIZE];
3647 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3650 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3652 	sock = ev->ev_sender;
3653 	INSIST(VALID_SOCKET(sock));
3658 	 * When the internal event was sent the reference count was bumped
3659 	 * to keep the socket around for us.  Decrement the count here.
3661 	INSIST(sock->references > 0);
3663 	if (sock->references == 0) {
3664 		UNLOCK(&sock->lock);
3670 	 * Has this event been canceled?
3672 	dev = sock->connect_ev;
3674 		INSIST(!sock->connecting);
3675 		UNLOCK(&sock->lock);
3679 	INSIST(sock->connecting);
3680 	sock->connecting = 0;
3683 	 * Get any possible error status here.
3685 	optlen = sizeof(cc);
3686 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3687 		       (void *)&cc, (void *)&optlen) < 0)
3694 		 * If the error is EAGAIN, just re-select on this
3695 		 * fd and pretend nothing strange happened.
3697 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3698 			sock->connecting = 1;
3699 			select_poke(sock->manager, sock->fd,
3700 				    SELECT_POKE_CONNECT);
3701 			UNLOCK(&sock->lock);
3707 		 * Translate other errors into ISC_R_* flavors.
/* Same errno table as isc_socket_connect(), but here the result is
 * recorded and delivery continues (break, not goto). */
3710 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3711 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
3712 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3713 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3714 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3715 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3717 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3719 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3720 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3721 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3722 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3723 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3724 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3727 			dev->result = ISC_R_UNEXPECTED;
3728 			isc_sockaddr_format(&sock->address, peerbuf,
3730 			isc__strerror(errno, strbuf, sizeof(strbuf));
3731 			UNEXPECTED_ERROR(__FILE__, __LINE__,
3732 					 "internal_connect: connect(%s) %s",
3736 		dev->result = ISC_R_SUCCESS;
3737 		sock->connected = 1;
3741 	sock->connect_ev = NULL;
3743 	UNLOCK(&sock->lock);
3745 	task = dev->ev_sender;
3746 	dev->ev_sender = sock;
3747 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3751 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
/*
 * Return the cached peer address of a connected socket.  Uses the
 * address stored at connect time rather than calling getpeername(2).
 */
3752 	isc_result_t result;
3754 	REQUIRE(VALID_SOCKET(sock));
3755 	REQUIRE(addressp != NULL);
3759 	if (sock->connected) {
3760 		*addressp = sock->address;
3761 		result = ISC_R_SUCCESS;
3763 		result = ISC_R_NOTCONNECTED;
3766 	UNLOCK(&sock->lock);
3772 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
/*
 * Fetch the local address via getsockname(2); requires the socket to
 * be bound.  On success addressp->length is set from the kernel value.
 */
3773 	ISC_SOCKADDR_LEN_T len;
3774 	isc_result_t result;
3775 	char strbuf[ISC_STRERRORSIZE];
3777 	REQUIRE(VALID_SOCKET(sock));
3778 	REQUIRE(addressp != NULL);
3783 		result = ISC_R_NOTBOUND;
3787 	result = ISC_R_SUCCESS;
3789 	len = sizeof(addressp->type);
3790 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3791 		isc__strerror(errno, strbuf, sizeof(strbuf));
3792 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3794 		result = ISC_R_UNEXPECTED;
3797 	addressp->length = (unsigned int)len;
3800 	UNLOCK(&sock->lock);
3806 * Run through the list of events on this socket, and cancel the ones
3807 * queued for task "task" of type "how". "how" is a bitmask.
3810 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
/*
 * Cancel queued recv/send/accept/connect requests on the socket,
 * restricted to events belonging to `task` (or all tasks when task is
 * NULL).  Canceled events are delivered with result ISC_R_CANCELED.
 *
 * FIX(review): the two isc_task_sendanddetach() calls below had been
 * corrupted by HTML-entity mangling: "&current_task" had become
 * "¤t_task" ("&curren" was rendered as the currency sign U+00A4).
 * Restored to "&current_task" to match the surrounding code and the
 * isc_task_sendanddetach(isc_task_t **taskp, ...) signature.
 * NOTE(review): intermediate source lines are missing from this listing.
 */
3812 	REQUIRE(VALID_SOCKET(sock));
3815 	 * Quick exit if there is nothing to do.  Don't even bother locking
3824 	 * All of these do the same thing, more or less.
3826 	 *	o If the internal event is marked as "posted" try to
3827 	 *	  remove it from the task's queue.  If this fails, mark it
3828 	 *	  as canceled instead, and let the task clean it up later.
3829 	 *	o For each I/O request for that task of that type, post
3830 	 *	  its done event with status of "ISC_R_CANCELED".
3831 	 *	o Reset any state needed.
3833 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3834 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
3835 		isc_socketevent_t      *dev;
3836 		isc_socketevent_t      *next;
3837 		isc_task_t	       *current_task;
3839 		dev = ISC_LIST_HEAD(sock->recv_list);
3841 		while (dev != NULL) {
3842 			current_task = dev->ev_sender;
3843 			next = ISC_LIST_NEXT(dev, ev_link);
3845 			if ((task == NULL) || (task == current_task)) {
3846 				dev->result = ISC_R_CANCELED;
3847 				send_recvdone_event(sock, &dev);
3853 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3854 	    && !ISC_LIST_EMPTY(sock->send_list)) {
3855 		isc_socketevent_t      *dev;
3856 		isc_socketevent_t      *next;
3857 		isc_task_t	       *current_task;
3859 		dev = ISC_LIST_HEAD(sock->send_list);
3861 		while (dev != NULL) {
3862 			current_task = dev->ev_sender;
3863 			next = ISC_LIST_NEXT(dev, ev_link);
3865 			if ((task == NULL) || (task == current_task)) {
3866 				dev->result = ISC_R_CANCELED;
3867 				send_senddone_event(sock, &dev);
3873 	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3874 	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3875 		isc_socket_newconnev_t *dev;
3876 		isc_socket_newconnev_t *next;
3877 		isc_task_t	       *current_task;
3879 		dev = ISC_LIST_HEAD(sock->accept_list);
3880 		while (dev != NULL) {
3881 			current_task = dev->ev_sender;
3882 			next = ISC_LIST_NEXT(dev, ev_link);
3884 			if ((task == NULL) || (task == current_task)) {
3886 				ISC_LIST_UNLINK(sock->accept_list, dev,
/* Drop the reference taken for the pre-allocated child socket. */
3889 				dev->newsocket->references--;
3890 				free_socket(&dev->newsocket);
3892 				dev->result = ISC_R_CANCELED;
3893 				dev->ev_sender = sock;
3894 				isc_task_sendanddetach(&current_task,
3895 						       ISC_EVENT_PTR(&dev));
3903 	 * Connecting is not a list.
3905 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3906 	    && sock->connect_ev != NULL) {
3907 		isc_socket_connev_t    *dev;
3908 		isc_task_t	       *current_task;
3910 		INSIST(sock->connecting);
3911 		sock->connecting = 0;
3913 		dev = sock->connect_ev;
3914 		current_task = dev->ev_sender;
3916 		if ((task == NULL) || (task == current_task)) {
3917 			sock->connect_ev = NULL;
3919 			dev->result = ISC_R_CANCELED;
3920 			dev->ev_sender = sock;
3921 			isc_task_sendanddetach(&current_task,
3922 					       ISC_EVENT_PTR(&dev));
3926 	UNLOCK(&sock->lock);
3930 isc_socket_gettype(isc_socket_t *sock) {
/* Return the socket's type (udp/tcp/unix); no locking required since
 * the type is fixed at creation. */
3931 	REQUIRE(VALID_SOCKET(sock));
3933 	return (sock->type);
3937 isc_socket_isbound(isc_socket_t *sock) {
/* Report, under the socket lock, whether the socket has been bound. */
3941 	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3942 	UNLOCK(&sock->lock);
3948 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
/*
 * Best-effort toggle of IPV6_V6ONLY on an AF_INET6 socket; the
 * setsockopt result is deliberately ignored, and the call is a no-op
 * on builds without IPV6_V6ONLY.
 */
3949 #if defined(IPV6_V6ONLY)
3950 	int onoff = yes ? 1 : 0;
3956 	REQUIRE(VALID_SOCKET(sock));
3959 	if (sock->pf == AF_INET6) {
3960 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3961 				 (void *)&onoff, sizeof(onoff));
3966 #ifndef ISC_PLATFORM_USETHREADS
3968 isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd) {
/*
 * Non-threaded build only: expose copies of the manager's fd_sets for
 * the application's own select() loop.  Copies are returned because
 * select() modifies its fd_set arguments in place.
 */
3969 	if (socketmgr == NULL)
3972 	/* Prepare duplicates of fd_sets, as select() will modify */
3973 	memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
3974 	       socketmgr->fd_bufsize);
3975 	memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
3976 	       socketmgr->fd_bufsize);
3977 	*readset = socketmgr->read_fds_copy;
3978 	*writeset = socketmgr->write_fds_copy;
3979 	*maxfd = socketmgr->maxfd + 1;
3984 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
/*
 * Non-threaded build only: process the fds the application's select()
 * reported ready.  Returns ISC_R_NOTFOUND if no manager exists.
 */
3985 	isc_socketmgr_t *manager = socketmgr;
3987 	if (manager == NULL)
3988 		return (ISC_R_NOTFOUND);
3990 	process_fds(manager, maxfd, readset, writeset);
3991 	return (ISC_R_SUCCESS);
3993 #endif /* ISC_PLATFORM_USETHREADS */