]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bind9/lib/isc/unix/socket.c
- Update ncurses to 5.7-20081102 (5.7 release) and build glue
[FreeBSD/FreeBSD.git] / contrib / bind9 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2008  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.237.18.29.10.6 2008/07/29 04:47:31 each Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #ifdef ISC_PLATFORM_HAVESYSUNH
29 #include <sys/un.h>
30 #endif
31 #include <sys/time.h>
32 #include <sys/uio.h>
33
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40
41 #include <isc/buffer.h>
42 #include <isc/bufferlist.h>
43 #include <isc/condition.h>
44 #include <isc/formatcheck.h>
45 #include <isc/list.h>
46 #include <isc/log.h>
47 #include <isc/mem.h>
48 #include <isc/msgs.h>
49 #include <isc/mutex.h>
50 #include <isc/net.h>
51 #include <isc/once.h>
52 #include <isc/platform.h>
53 #include <isc/print.h>
54 #include <isc/region.h>
55 #include <isc/socket.h>
56 #include <isc/strerror.h>
57 #include <isc/task.h>
58 #include <isc/thread.h>
59 #include <isc/util.h>
60
61 #include "errno2result.h"
62
63 #ifndef ISC_PLATFORM_USETHREADS
64 #include "socket_p.h"
65 #endif /* ISC_PLATFORM_USETHREADS */
66
67 /*%
68  * Max number of open sockets.  In the vast majority of cases the default size  
69  * of FD_SETSIZE should be fine, and this constant should be increased only
70  * when absolutely necessary and possible, i.e., the server is exhausting all   
71  * available file descriptors (up to FD_SETSIZE) and the select() function
72  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
73  * always be true, but we keep using some of them to ensure as much
74  * portability as possible).  Note also that overall server performance
75  * may be rather worsened with a larger value of this constant due to
76  * inherent scalability problems of select().
77  *
78  * As a special note, this value shouldn't have to be touched if
79  * this is a build for an authoritative only DNS server.
80  */
81
/*
 * Unless the build overrides it, the socket table is sized to the
 * platform's FD_SETSIZE.
 */
#if !defined(ISC_SOCKET_FDSETSIZE)
#define ISC_SOCKET_FDSETSIZE FD_SETSIZE
#endif

/*%
 * Mac OS X needs a special definition to support values larger than
 * FD_SETSIZE in select().
 */
#if defined(__APPLE__) && ISC_SOCKET_FDSETSIZE > FD_SETSIZE
#define _DARWIN_UNLIMITED_SELECT
#endif

/*%
 * The socket length argument is an int on some systems, a size_t on
 * others and a socklen_t on yet others.  It is spelled through this macro
 * so that it can be changed in one place if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
102
103
104 #if defined(SO_BSDCOMPAT) && defined(__linux__)
105 #include <sys/utsname.h>
106 #endif
107
108 /*%
109  * Define what the possible "soft" errors can be.  These are non-fatal returns
110  * of various network related functions, like recv() and so on.
111  *
112  * For some reason, BSDI (and perhaps others) will sometimes return <0
113  * from recv() but will have errno==0.  This is broken, but we have to
114  * work around it here.
115  */
116 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
117                          (e) == EWOULDBLOCK || \
118                          (e) == EINTR || \
119                          (e) == 0)
120
121 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
122
123 /*!<
124  * DLVL(90)  --  Function entry/exit and other tracing.
125  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
126  * DLVL(60)  --  Socket data send/receive
127  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
128  * DLVL(20)  --  Socket creation/destruction.
129  */
130 #define TRACE_LEVEL             90
131 #define CORRECTNESS_LEVEL       70
132 #define IOEVENT_LEVEL           60
133 #define EVENT_LEVEL             50
134 #define CREATION_LEVEL          20
135
136 #define TRACE           DLVL(TRACE_LEVEL)
137 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
138 #define IOEVENT         DLVL(IOEVENT_LEVEL)
139 #define EVENT           DLVL(EVENT_LEVEL)
140 #define CREATION        DLVL(CREATION_LEVEL)
141
142 typedef isc_event_t intev_t;
143
144 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
145 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
146
147 /*!
148  * IPv6 control information.  If the socket is an IPv6 socket we want
149  * to collect the destination address and interface so the client can
150  * set them on outgoing packets.
151  */
#if defined(ISC_PLATFORM_HAVEIN6PKTINFO) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#if defined(SO_TIMESTAMP) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#define RCVBUFSIZE (32*1024)

/*%
 * How many times a send operation is repeated when it fails with EINTR.
 */
#define NRETRIES 10
178
179 struct isc_socket {
180         /* Not locked. */
181         unsigned int            magic;
182         isc_socketmgr_t        *manager;
183         isc_mutex_t             lock;
184         isc_sockettype_t        type;
185
186         /* Locked by socket lock. */
187         ISC_LINK(isc_socket_t)  link;
188         unsigned int            references;
189         int                     fd;
190         int                     pf;
191
192         ISC_LIST(isc_socketevent_t)             send_list;
193         ISC_LIST(isc_socketevent_t)             recv_list;
194         ISC_LIST(isc_socket_newconnev_t)        accept_list;
195         isc_socket_connev_t                    *connect_ev;
196
197         /*
198          * Internal events.  Posted when a descriptor is readable or
199          * writable.  These are statically allocated and never freed.
200          * They will be set to non-purgable before use.
201          */
202         intev_t                 readable_ev;
203         intev_t                 writable_ev;
204
205         isc_sockaddr_t          address;  /* remote address */
206
207         unsigned int            pending_recv : 1,
208                                 pending_send : 1,
209                                 pending_accept : 1,
210                                 listener : 1, /* listener socket */
211                                 connected : 1,
212                                 connecting : 1, /* connect pending */
213                                 bound : 1; /* bound to local addr */
214
215 #ifdef ISC_NET_RECVOVERFLOW
216         unsigned char           overflow; /* used for MSG_TRUNC fake */
217 #endif
218
219         char                    *recvcmsgbuf;
220         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
221         char                    *sendcmsgbuf;
222         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
223 };
224
225 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
226 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
227
228 struct isc_socketmgr {
229         /* Not locked. */
230         unsigned int            magic;
231         isc_mem_t              *mctx;
232         isc_mutex_t             lock;
233         int                     fd_bufsize;
234         int                     fdsize;
235         /* Locked by manager lock. */
236         ISC_LIST(isc_socket_t)  socklist;
237         fd_set                  *read_fds;
238         fd_set                  *read_fds_copy;
239         fd_set                  *write_fds;
240         fd_set                  *write_fds_copy;
241         isc_socket_t           **fds;
242         int                     *fdstate;
243         int                     maxfd;
244         int                     reserved;       /* unlocked */
245 #ifdef ISC_PLATFORM_USETHREADS
246         isc_thread_t            watcher;
247         isc_condition_t         shutdown_ok;
248         int                     pipe_fds[2];
249 #else /* ISC_PLATFORM_USETHREADS */
250         unsigned int            refs;
251 #endif /* ISC_PLATFORM_USETHREADS */
252 };
253
254 #ifndef ISC_PLATFORM_USETHREADS
255 static isc_socketmgr_t *socketmgr = NULL;
256 #endif /* ISC_PLATFORM_USETHREADS */
257
258 #define CLOSED          0       /* this one must be zero */
259 #define MANAGED         1
260 #define CLOSE_PENDING   2
261
262 /*
263  * send() and recv() iovec counts
264  */
265 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
266 #ifdef ISC_NET_RECVOVERFLOW
267 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
268 #else
269 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
270 #endif
271
272 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
273 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
274 static void free_socket(isc_socket_t **);
275 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
276                                     isc_socket_t **);
277 static void destroy(isc_socket_t **);
278 static void internal_accept(isc_task_t *, isc_event_t *);
279 static void internal_connect(isc_task_t *, isc_event_t *);
280 static void internal_recv(isc_task_t *, isc_event_t *);
281 static void internal_send(isc_task_t *, isc_event_t *);
282 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
283 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
284                               struct msghdr *, struct iovec *, size_t *);
285 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
286                               struct msghdr *, struct iovec *, size_t *);
287 static void cleanup_fdsets(isc_socketmgr_t *, isc_mem_t *);
288 static isc_result_t create_fdsets(isc_socketmgr_t *, isc_mem_t *);
289
290 #define SELECT_POKE_SHUTDOWN            (-1)
291 #define SELECT_POKE_NOTHING             (-2)
292 #define SELECT_POKE_READ                (-3)
293 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
294 #define SELECT_POKE_WRITE               (-4)
295 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
296 #define SELECT_POKE_CLOSE               (-5)
297
298 #define SOCK_DEAD(s)                    ((s)->references == 0)
299
300 static void
301 manager_log(isc_socketmgr_t *sockmgr,
302             isc_logcategory_t *category, isc_logmodule_t *module, int level,
303             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
304 static void
305 manager_log(isc_socketmgr_t *sockmgr,
306             isc_logcategory_t *category, isc_logmodule_t *module, int level,
307             const char *fmt, ...)
308 {
309         char msgbuf[2048];
310         va_list ap;
311
312         if (! isc_log_wouldlog(isc_lctx, level))
313                 return;
314
315         va_start(ap, fmt);
316         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
317         va_end(ap);
318
319         isc_log_write(isc_lctx, category, module, level,
320                       "sockmgr %p: %s", sockmgr, msgbuf);
321 }
322
323 static void
324 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
325            isc_logcategory_t *category, isc_logmodule_t *module, int level,
326            isc_msgcat_t *msgcat, int msgset, int message,
327            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
328 static void
329 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
330            isc_logcategory_t *category, isc_logmodule_t *module, int level,
331            isc_msgcat_t *msgcat, int msgset, int message,
332            const char *fmt, ...)
333 {
334         char msgbuf[2048];
335         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
336         va_list ap;
337
338         if (! isc_log_wouldlog(isc_lctx, level))
339                 return;
340
341         va_start(ap, fmt);
342         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
343         va_end(ap);
344
345         if (address == NULL) {
346                 isc_log_iwrite(isc_lctx, category, module, level,
347                                msgcat, msgset, message,
348                                "socket %p: %s", sock, msgbuf);
349         } else {
350                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
351                 isc_log_iwrite(isc_lctx, category, module, level,
352                                msgcat, msgset, message,
353                                "socket %p %s: %s", sock, peerbuf, msgbuf);
354         }
355 }
356
357 static void
358 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
359         isc_socket_t *sock;
360
361         /*
362          * This is a wakeup on a socket.  If the socket is not in the
363          * process of being closed, start watching it for either reads
364          * or writes.
365          */
366
367         INSIST(fd >= 0 && fd < manager->fdsize);
368
369         if (manager->fdstate[fd] == CLOSE_PENDING) {
370                 manager->fdstate[fd] = CLOSED;
371                 FD_CLR(fd, manager->read_fds);
372                 FD_CLR(fd, manager->write_fds);
373                 (void)close(fd);
374                 return;
375         }
376         if (manager->fdstate[fd] != MANAGED)
377                 return;
378
379         sock = manager->fds[fd];
380
381         /*
382          * Set requested bit.
383          */
384         if (msg == SELECT_POKE_READ)
385                 FD_SET(sock->fd, manager->read_fds);
386         if (msg == SELECT_POKE_WRITE)
387                 FD_SET(sock->fd, manager->write_fds);
388 }
389
390 #ifdef ISC_PLATFORM_USETHREADS
391 /*
392  * Poke the select loop when there is something for us to do.
393  * The write is required (by POSIX) to complete.  That is, we
394  * will not get partial writes.
395  */
396 static void
397 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
398         int cc;
399         int buf[2];
400         char strbuf[ISC_STRERRORSIZE];
401
402         buf[0] = fd;
403         buf[1] = msg;
404
405         do {
406                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
407 #ifdef ENOSR
408                 /*
409                  * Treat ENOSR as EAGAIN but loop slowly as it is
410                  * unlikely to clear fast.
411                  */
412                 if (cc < 0 && errno == ENOSR) {
413                         sleep(1);
414                         errno = EAGAIN;
415                 }
416 #endif
417         } while (cc < 0 && SOFT_ERROR(errno));
418
419         if (cc < 0) {
420                 isc__strerror(errno, strbuf, sizeof(strbuf));
421                 FATAL_ERROR(__FILE__, __LINE__,
422                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
423                                            ISC_MSG_WRITEFAILED,
424                                            "write() failed "
425                                            "during watcher poke: %s"),
426                             strbuf);
427         }
428
429         INSIST(cc == sizeof(buf));
430 }
431
432 /*
433  * Read a message on the internal fd.
434  */
435 static void
436 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
437         int buf[2];
438         int cc;
439         char strbuf[ISC_STRERRORSIZE];
440
441         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
442         if (cc < 0) {
443                 *msg = SELECT_POKE_NOTHING;
444                 *fd = -1;       /* Silence compiler. */
445                 if (SOFT_ERROR(errno))
446                         return;
447
448                 isc__strerror(errno, strbuf, sizeof(strbuf));
449                 FATAL_ERROR(__FILE__, __LINE__,
450                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
451                                            ISC_MSG_READFAILED,
452                                            "read() failed "
453                                            "during watcher poke: %s"),
454                             strbuf);
455                 
456                 return;
457         }
458         INSIST(cc == sizeof(buf));
459
460         *fd = buf[0];
461         *msg = buf[1];
462 }
463 #else /* ISC_PLATFORM_USETHREADS */
464 /*
465  * Update the state of the socketmgr when something changes.
466  */
467 static void
468 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
469         if (msg == SELECT_POKE_SHUTDOWN)
470                 return;
471         else if (fd >= 0)
472                 wakeup_socket(manager, fd, msg);
473         return;
474 }
475 #endif /* ISC_PLATFORM_USETHREADS */
476
477 /*
478  * Make a fd non-blocking.
479  */
480 static isc_result_t
481 make_nonblock(int fd) {
482         int ret;
483         int flags;
484         char strbuf[ISC_STRERRORSIZE];
485 #ifdef USE_FIONBIO_IOCTL
486         int on = 1;
487
488         ret = ioctl(fd, FIONBIO, (char *)&on);
489 #else
490         flags = fcntl(fd, F_GETFL, 0);
491         flags |= PORT_NONBLOCK;
492         ret = fcntl(fd, F_SETFL, flags);
493 #endif
494
495         if (ret == -1) {
496                 isc__strerror(errno, strbuf, sizeof(strbuf));
497                 UNEXPECTED_ERROR(__FILE__, __LINE__,
498 #ifdef USE_FIONBIO_IOCTL
499                                  "ioctl(%d, FIONBIO, &on): %s", fd,
500 #else
501                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
502 #endif
503                                  strbuf);
504
505                 return (ISC_R_UNEXPECTED);
506         }
507
508         return (ISC_R_SUCCESS);
509 }
510
511 #ifdef USE_CMSG
512 /*
513  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
514  * In order to ensure as much portability as possible, we provide wrapper
515  * functions of these macros.
516  * Note that cmsg_space() could run slow on OSes that do not have
517  * CMSG_SPACE.
518  */
519 static inline ISC_SOCKADDR_LEN_T
520 cmsg_len(ISC_SOCKADDR_LEN_T len) {
521 #ifdef CMSG_LEN
522         return (CMSG_LEN(len));
523 #else
524         ISC_SOCKADDR_LEN_T hdrlen;
525
526         /*
527          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
528          * is correct.
529          */
530         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
531         return (hdrlen + len);
532 #endif
533 }
534
535 static inline ISC_SOCKADDR_LEN_T
536 cmsg_space(ISC_SOCKADDR_LEN_T len) {
537 #ifdef CMSG_SPACE
538         return (CMSG_SPACE(len));
539 #else
540         struct msghdr msg;
541         struct cmsghdr *cmsgp;
542         /*
543          * XXX: The buffer length is an ad-hoc value, but should be enough
544          * in a practical sense.
545          */
546         char dummybuf[sizeof(struct cmsghdr) + 1024];
547
548         memset(&msg, 0, sizeof(msg));
549         msg.msg_control = dummybuf;
550         msg.msg_controllen = sizeof(dummybuf);
551
552         cmsgp = (struct cmsghdr *)dummybuf;
553         cmsgp->cmsg_len = cmsg_len(len);
554
555         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
556         if (cmsgp != NULL)
557                 return ((char *)cmsgp - (char *)msg.msg_control);
558         else
559                 return (0);
560 #endif  
561 }
562 #endif /* USE_CMSG */
563
564 /*
565  * Process control messages received on a socket.
566  */
567 static void
568 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
569 #ifdef USE_CMSG
570         struct cmsghdr *cmsgp;
571 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
572         struct in6_pktinfo *pktinfop;
573 #endif
574 #ifdef SO_TIMESTAMP
575         struct timeval *timevalp;
576 #endif
577 #endif
578
579         /*
580          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
581          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
582          * They are all here, outside of the CPP tests, because it is
583          * more consistent with the usual ISC coding style.
584          */
585         UNUSED(sock);
586         UNUSED(msg);
587         UNUSED(dev);
588
589 #ifdef ISC_NET_BSD44MSGHDR
590
591 #ifdef MSG_TRUNC
592         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
593                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
594 #endif
595
596 #ifdef MSG_CTRUNC
597         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
598                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
599 #endif
600
601 #ifndef USE_CMSG
602         return;
603 #else
604         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
605                 return;
606
607 #ifdef SO_TIMESTAMP
608         timevalp = NULL;
609 #endif
610 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
611         pktinfop = NULL;
612 #endif
613
614         cmsgp = CMSG_FIRSTHDR(msg);
615         while (cmsgp != NULL) {
616                 socket_log(sock, NULL, TRACE,
617                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
618                            "processing cmsg %p", cmsgp);
619
620 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
621                 if (cmsgp->cmsg_level == IPPROTO_IPV6
622                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
623
624                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
625                         memcpy(&dev->pktinfo, pktinfop,
626                                sizeof(struct in6_pktinfo));
627                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
628                         socket_log(sock, NULL, TRACE,
629                                    isc_msgcat, ISC_MSGSET_SOCKET,
630                                    ISC_MSG_IFRECEIVED,
631                                    "interface received on ifindex %u",
632                                    dev->pktinfo.ipi6_ifindex);
633                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
634                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;                         
635                         goto next;
636                 }
637 #endif
638
639 #ifdef SO_TIMESTAMP
640                 if (cmsgp->cmsg_level == SOL_SOCKET
641                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
642                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
643                         dev->timestamp.seconds = timevalp->tv_sec;
644                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
645                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
646                         goto next;
647                 }
648 #endif
649
650         next:
651                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
652         }
653 #endif /* USE_CMSG */
654
655 #endif /* ISC_NET_BSD44MSGHDR */
656 }
657
658 /*
659  * Construct an iov array and attach it to the msghdr passed in.  This is
660  * the SEND constructor, which will use the used region of the buffer
661  * (if using a buffer list) or will use the internal region (if a single
662  * buffer I/O is requested).
663  *
664  * Nothing can be NULL, and the done event must list at least one buffer
665  * on the buffer linked list for this function to be meaningful.
666  *
667  * If write_countp != NULL, *write_countp will hold the number of bytes
668  * this transaction can send.
669  */
670 static void
671 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
672                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
673 {
674         unsigned int iovcount;
675         isc_buffer_t *buffer;
676         isc_region_t used;
677         size_t write_count;
678         size_t skip_count;
679
680         memset(msg, 0, sizeof(*msg));
681
682         if (sock->type == isc_sockettype_udp) {
683                 msg->msg_name = (void *)&dev->address.type.sa;
684                 msg->msg_namelen = dev->address.length;
685         } else {
686                 msg->msg_name = NULL;
687                 msg->msg_namelen = 0;
688         }
689
690         buffer = ISC_LIST_HEAD(dev->bufferlist);
691         write_count = 0;
692         iovcount = 0;
693
694         /*
695          * Single buffer I/O?  Skip what we've done so far in this region.
696          */
697         if (buffer == NULL) {
698                 write_count = dev->region.length - dev->n;
699                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
700                 iov[0].iov_len = write_count;
701                 iovcount = 1;
702
703                 goto config;
704         }
705
706         /*
707          * Multibuffer I/O.
708          * Skip the data in the buffer list that we have already written.
709          */
710         skip_count = dev->n;
711         while (buffer != NULL) {
712                 REQUIRE(ISC_BUFFER_VALID(buffer));
713                 if (skip_count < isc_buffer_usedlength(buffer))
714                         break;
715                 skip_count -= isc_buffer_usedlength(buffer);
716                 buffer = ISC_LIST_NEXT(buffer, link);
717         }
718
719         while (buffer != NULL) {
720                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
721
722                 isc_buffer_usedregion(buffer, &used);
723
724                 if (used.length > 0) {
725                         iov[iovcount].iov_base = (void *)(used.base
726                                                           + skip_count);
727                         iov[iovcount].iov_len = used.length - skip_count;
728                         write_count += (used.length - skip_count);
729                         skip_count = 0;
730                         iovcount++;
731                 }
732                 buffer = ISC_LIST_NEXT(buffer, link);
733         }
734
735         INSIST(skip_count == 0U);
736
737  config:
738         msg->msg_iov = iov;
739         msg->msg_iovlen = iovcount;
740
741 #ifdef ISC_NET_BSD44MSGHDR
742         msg->msg_control = NULL;
743         msg->msg_controllen = 0;
744         msg->msg_flags = 0;
745 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
746         if ((sock->type == isc_sockettype_udp)
747             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
748                 struct cmsghdr *cmsgp;
749                 struct in6_pktinfo *pktinfop;
750
751                 socket_log(sock, NULL, TRACE,
752                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
753                            "sendto pktinfo data, ifindex %u",
754                            dev->pktinfo.ipi6_ifindex);
755
756                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
757                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
758                 msg->msg_control = (void *)sock->sendcmsgbuf;
759
760                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
761                 cmsgp->cmsg_level = IPPROTO_IPV6;
762                 cmsgp->cmsg_type = IPV6_PKTINFO;
763                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
764                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
765                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
766         }
767 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
768 #else /* ISC_NET_BSD44MSGHDR */
769         msg->msg_accrights = NULL;
770         msg->msg_accrightslen = 0;
771 #endif /* ISC_NET_BSD44MSGHDR */
772
773         if (write_countp != NULL)
774                 *write_countp = write_count;
775 }
776
777 /*
778  * Construct an iov array and attach it to the msghdr passed in.  This is
779  * the RECV constructor, which will use the available region of the buffer
780  * (if using a buffer list) or will use the internal region (if a single
781  * buffer I/O is requested).
782  *
783  * Nothing can be NULL, and the done event must list at least one buffer
784  * on the buffer linked list for this function to be meaningful.
785  *
786  * If read_countp != NULL, *read_countp will hold the number of bytes
787  * this transaction can receive.
788  */
789 static void
790 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
791                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
792 {
793         unsigned int iovcount;
794         isc_buffer_t *buffer;
795         isc_region_t available;
796         size_t read_count;
797
798         memset(msg, 0, sizeof(struct msghdr));
799
800         if (sock->type == isc_sockettype_udp) {
801                 memset(&dev->address, 0, sizeof(dev->address));
802 #ifdef BROKEN_RECVMSG
803                 if (sock->pf == AF_INET) {
804                         msg->msg_name = (void *)&dev->address.type.sin;
805                         msg->msg_namelen = sizeof(dev->address.type.sin6);
806                 } else if (sock->pf == AF_INET6) {
807                         msg->msg_name = (void *)&dev->address.type.sin6;
808                         msg->msg_namelen = sizeof(dev->address.type.sin6);
809 #ifdef ISC_PLATFORM_HAVESYSUNH
810                 } else if (sock->pf == AF_UNIX) {
811                         msg->msg_name = (void *)&dev->address.type.sunix;
812                         msg->msg_namelen = sizeof(dev->address.type.sunix);
813 #endif
814                 } else {
815                         msg->msg_name = (void *)&dev->address.type.sa;
816                         msg->msg_namelen = sizeof(dev->address.type);
817                 }
818 #else
819                 msg->msg_name = (void *)&dev->address.type.sa;
820                 msg->msg_namelen = sizeof(dev->address.type);
821 #endif
822 #ifdef ISC_NET_RECVOVERFLOW
823                 /* If needed, steal one iovec for overflow detection. */
824                 maxiov--;
825 #endif
826         } else { /* TCP */
827                 msg->msg_name = NULL;
828                 msg->msg_namelen = 0;
829                 dev->address = sock->address;
830         }
831
832         buffer = ISC_LIST_HEAD(dev->bufferlist);
833         read_count = 0;
834
835         /*
836          * Single buffer I/O?  Skip what we've done so far in this region.
837          */
838         if (buffer == NULL) {
839                 read_count = dev->region.length - dev->n;
840                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
841                 iov[0].iov_len = read_count;
842                 iovcount = 1;
843
844                 goto config;
845         }
846
847         /*
848          * Multibuffer I/O.
849          * Skip empty buffers.
850          */
851         while (buffer != NULL) {
852                 REQUIRE(ISC_BUFFER_VALID(buffer));
853                 if (isc_buffer_availablelength(buffer) != 0)
854                         break;
855                 buffer = ISC_LIST_NEXT(buffer, link);
856         }
857
858         iovcount = 0;
859         while (buffer != NULL) {
860                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
861
862                 isc_buffer_availableregion(buffer, &available);
863
864                 if (available.length > 0) {
865                         iov[iovcount].iov_base = (void *)(available.base);
866                         iov[iovcount].iov_len = available.length;
867                         read_count += available.length;
868                         iovcount++;
869                 }
870                 buffer = ISC_LIST_NEXT(buffer, link);
871         }
872
873  config:
874
875         /*
876          * If needed, set up to receive that one extra byte.  Note that
877          * we know there is at least one iov left, since we stole it
878          * at the top of this function.
879          */
880 #ifdef ISC_NET_RECVOVERFLOW
881         if (sock->type == isc_sockettype_udp) {
882                 iov[iovcount].iov_base = (void *)(&sock->overflow);
883                 iov[iovcount].iov_len = 1;
884                 iovcount++;
885         }
886 #endif
887
888         msg->msg_iov = iov;
889         msg->msg_iovlen = iovcount;
890
891 #ifdef ISC_NET_BSD44MSGHDR
892         msg->msg_control = NULL;
893         msg->msg_controllen = 0;
894         msg->msg_flags = 0;
895 #if defined(USE_CMSG)
896         if (sock->type == isc_sockettype_udp) {
897                 msg->msg_control = sock->recvcmsgbuf;
898                 msg->msg_controllen = sock->recvcmsgbuflen;
899         }
900 #endif /* USE_CMSG */
901 #else /* ISC_NET_BSD44MSGHDR */
902         msg->msg_accrights = NULL;
903         msg->msg_accrightslen = 0;
904 #endif /* ISC_NET_BSD44MSGHDR */
905
906         if (read_countp != NULL)
907                 *read_countp = read_count;
908 }
909
910 static void
911 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
912                 isc_socketevent_t *dev)
913 {
914         if (sock->type == isc_sockettype_udp) {
915                 if (address != NULL)
916                         dev->address = *address;
917                 else
918                         dev->address = sock->address;
919         } else if (sock->type == isc_sockettype_tcp) {
920                 INSIST(address == NULL);
921                 dev->address = sock->address;
922         }
923 }
924
925 static void
926 destroy_socketevent(isc_event_t *event) {
927         isc_socketevent_t *ev = (isc_socketevent_t *)event;
928
929         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
930
931         (ev->destroy)(event);
932 }
933
934 static isc_socketevent_t *
935 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
936                      isc_taskaction_t action, const void *arg)
937 {
938         isc_socketevent_t *ev;
939
940         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
941                                                      sock, eventtype,
942                                                      action, arg,
943                                                      sizeof(*ev));
944
945         if (ev == NULL)
946                 return (NULL);
947
948         ev->result = ISC_R_UNEXPECTED;
949         ISC_LINK_INIT(ev, ev_link);
950         ISC_LIST_INIT(ev->bufferlist);
951         ev->region.base = NULL;
952         ev->n = 0;
953         ev->offset = 0;
954         ev->attributes = 0;
955         ev->destroy = ev->ev_destroy;
956         ev->ev_destroy = destroy_socketevent;
957
958         return (ev);
959 }
960
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the fields of 'msg' (name, iovec array and, when
 * ISC_NET_BSD44MSGHDR is defined, the control buffer) to stdout.
 *
 * The length fields have platform-dependent integer types, so each one
 * is cast to unsigned long and printed with %lu, and every pointer is
 * cast to void * as %p requires.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
               (unsigned long)msg->msg_namelen);
        printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
               (unsigned long)msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %lu\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
               (unsigned long)msg->msg_controllen);
#endif
}
#endif
979
/*
 * Status codes returned by the doio_*() routines.
 */
#define DOIO_SUCCESS    0       /* I/O succeeded; event sent */
#define DOIO_SOFT       1       /* I/O ok but soft error; no event sent */
#define DOIO_HARD       2       /* I/O error; event sent */
#define DOIO_EOF        3       /* end of file; no event sent */
984
985 static int
986 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
987         int cc;
988         struct iovec iov[MAXSCATTERGATHER_RECV];
989         size_t read_count;
990         size_t actual_count;
991         struct msghdr msghdr;
992         isc_buffer_t *buffer;
993         int recv_errno;
994         char strbuf[ISC_STRERRORSIZE];
995
996         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
997
998 #if defined(ISC_SOCKET_DEBUG)
999         dump_msg(&msghdr);
1000 #endif
1001
1002         cc = recvmsg(sock->fd, &msghdr, 0);
1003         recv_errno = errno;
1004
1005 #if defined(ISC_SOCKET_DEBUG)
1006         dump_msg(&msghdr);
1007 #endif
1008
1009         if (cc < 0) {
1010                 if (SOFT_ERROR(recv_errno))
1011                         return (DOIO_SOFT);
1012
1013                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1014                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1015                         socket_log(sock, NULL, IOEVENT,
1016                                    isc_msgcat, ISC_MSGSET_SOCKET,
1017                                    ISC_MSG_DOIORECV, 
1018                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1019                                    sock->fd, cc, recv_errno, strbuf);
1020                 }
1021
1022 #define SOFT_OR_HARD(_system, _isc) \
1023         if (recv_errno == _system) { \
1024                 if (sock->connected) { \
1025                         dev->result = _isc; \
1026                         return (DOIO_HARD); \
1027                 } \
1028                 return (DOIO_SOFT); \
1029         }
1030 #define ALWAYS_HARD(_system, _isc) \
1031         if (recv_errno == _system) { \
1032                 dev->result = _isc; \
1033                 return (DOIO_HARD); \
1034         }
1035
1036                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1037                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1038                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1039                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1040                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1041                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1042                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1043
1044 #undef SOFT_OR_HARD
1045 #undef ALWAYS_HARD
1046
1047                 dev->result = isc__errno2result(recv_errno);
1048                 return (DOIO_HARD);
1049         }
1050
1051         /*
1052          * On TCP, zero length reads indicate EOF, while on
1053          * UDP, zero length reads are perfectly valid, although
1054          * strange.
1055          */
1056         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1057                 return (DOIO_EOF);
1058
1059         if (sock->type == isc_sockettype_udp) {
1060                 dev->address.length = msghdr.msg_namelen;
1061                 if (isc_sockaddr_getport(&dev->address) == 0) {
1062                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1063                                 socket_log(sock, &dev->address, IOEVENT,
1064                                            isc_msgcat, ISC_MSGSET_SOCKET,
1065                                            ISC_MSG_ZEROPORT, 
1066                                            "dropping source port zero packet");
1067                         }
1068                         return (DOIO_SOFT);
1069                 }
1070         }
1071
1072         socket_log(sock, &dev->address, IOEVENT,
1073                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1074                    "packet received correctly");
1075
1076         /*
1077          * Overflow bit detection.  If we received MORE bytes than we should,
1078          * this indicates an overflow situation.  Set the flag in the
1079          * dev entry and adjust how much we read by one.
1080          */
1081 #ifdef ISC_NET_RECVOVERFLOW
1082         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1083                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1084                 cc--;
1085         }
1086 #endif
1087
1088         /*
1089          * If there are control messages attached, run through them and pull
1090          * out the interesting bits.
1091          */
1092         if (sock->type == isc_sockettype_udp)
1093                 process_cmsg(sock, &msghdr, dev);
1094
1095         /*
1096          * update the buffers (if any) and the i/o count
1097          */
1098         dev->n += cc;
1099         actual_count = cc;
1100         buffer = ISC_LIST_HEAD(dev->bufferlist);
1101         while (buffer != NULL && actual_count > 0U) {
1102                 REQUIRE(ISC_BUFFER_VALID(buffer));
1103                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1104                         actual_count -= isc_buffer_availablelength(buffer);
1105                         isc_buffer_add(buffer,
1106                                        isc_buffer_availablelength(buffer));
1107                 } else {
1108                         isc_buffer_add(buffer, actual_count);
1109                         actual_count = 0;
1110                         break;
1111                 }
1112                 buffer = ISC_LIST_NEXT(buffer, link);
1113                 if (buffer == NULL) {
1114                         INSIST(actual_count == 0U);
1115                 }
1116         }
1117
1118         /*
1119          * If we read less than we expected, update counters,
1120          * and let the upper layer poke the descriptor.
1121          */
1122         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1123                 return (DOIO_SOFT);
1124
1125         /*
1126          * Full reads are posted, or partials if partials are ok.
1127          */
1128         dev->result = ISC_R_SUCCESS;
1129         return (DOIO_SUCCESS);
1130 }
1131
1132 /*
1133  * Returns:
1134  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1135  *                      ISC_R_SUCCESS.
1136  *
1137  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1138  *                      dev->result contains the appropriate error.
1139  *
1140  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1141  *                      event was sent.  The operation should be retried.
1142  *
1143  *      No other return values are possible.
1144  */
1145 static int
1146 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1147         int cc;
1148         struct iovec iov[MAXSCATTERGATHER_SEND];
1149         size_t write_count;
1150         struct msghdr msghdr;
1151         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1152         int attempts = 0;
1153         int send_errno;
1154         char strbuf[ISC_STRERRORSIZE];
1155
1156         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1157
1158  resend:
1159         cc = sendmsg(sock->fd, &msghdr, 0);
1160         send_errno = errno;
1161
1162         /*
1163          * Check for error or block condition.
1164          */
1165         if (cc < 0) {
1166                 if (send_errno == EINTR && ++attempts < NRETRIES)
1167                         goto resend;
1168
1169                 if (SOFT_ERROR(send_errno))
1170                         return (DOIO_SOFT);
1171
1172 #define SOFT_OR_HARD(_system, _isc) \
1173         if (send_errno == _system) { \
1174                 if (sock->connected) { \
1175                         dev->result = _isc; \
1176                         return (DOIO_HARD); \
1177                 } \
1178                 return (DOIO_SOFT); \
1179         }
1180 #define ALWAYS_HARD(_system, _isc) \
1181         if (send_errno == _system) { \
1182                 dev->result = _isc; \
1183                 return (DOIO_HARD); \
1184         }
1185
1186                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1187                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1188                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1189                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1190                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1191 #ifdef EHOSTDOWN
1192                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1193 #endif
1194                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1195                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1196                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1197                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1198                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1199
1200 #undef SOFT_OR_HARD
1201 #undef ALWAYS_HARD
1202
1203                 /*
1204                  * The other error types depend on whether or not the
1205                  * socket is UDP or TCP.  If it is UDP, some errors
1206                  * that we expect to be fatal under TCP are merely
1207                  * annoying, and are really soft errors.
1208                  *
1209                  * However, these soft errors are still returned as
1210                  * a status.
1211                  */
1212                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1213                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1214                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1215                                  addrbuf, strbuf);
1216                 dev->result = isc__errno2result(send_errno);
1217                 return (DOIO_HARD);
1218         }
1219
1220         if (cc == 0)
1221                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1222                                  "internal_send: send() %s 0",
1223                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1224                                                 ISC_MSG_RETURNED, "returned"));
1225
1226         /*
1227          * If we write less than we expected, update counters, poke.
1228          */
1229         dev->n += cc;
1230         if ((size_t)cc != write_count)
1231                 return (DOIO_SOFT);
1232
1233         /*
1234          * Exactly what we wanted to write.  We're done with this
1235          * entry.  Post its completion event.
1236          */
1237         dev->result = ISC_R_SUCCESS;
1238         return (DOIO_SUCCESS);
1239 }
1240
1241 /*
1242  * Kill.
1243  *
1244  * Caller must ensure that the socket is not locked and no external
1245  * references exist.
1246  */
1247 static void
1248 destroy(isc_socket_t **sockp) {
1249         isc_socket_t *sock = *sockp;
1250         isc_socketmgr_t *manager = sock->manager;
1251
1252         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1253                    ISC_MSG_DESTROYING, "destroying");
1254
1255         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1256         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1257         INSIST(ISC_LIST_EMPTY(sock->send_list));
1258         INSIST(sock->connect_ev == NULL);
1259         REQUIRE(sock->fd >= 0 && sock->fd < (int)manager->fdsize);
1260
1261         LOCK(&manager->lock);
1262
1263         /*
1264          * No one has this socket open, so the watcher doesn't have to be
1265          * poked, and the socket doesn't have to be locked.
1266          */
1267         manager->fds[sock->fd] = NULL;
1268         manager->fdstate[sock->fd] = CLOSE_PENDING;
1269         select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1270         ISC_LIST_UNLINK(manager->socklist, sock, link);
1271
1272 #ifdef ISC_PLATFORM_USETHREADS
1273         if (ISC_LIST_EMPTY(manager->socklist))
1274                 SIGNAL(&manager->shutdown_ok);
1275 #endif /* ISC_PLATFORM_USETHREADS */
1276
1277         /*
1278          * XXX should reset manager->maxfd here
1279          */
1280
1281         UNLOCK(&manager->lock);
1282
1283         free_socket(sockp);
1284 }
1285
1286 static isc_result_t
1287 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1288                 isc_socket_t **socketp)
1289 {
1290         isc_socket_t *sock;
1291         isc_result_t result;
1292         ISC_SOCKADDR_LEN_T cmsgbuflen;
1293
1294         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1295
1296         if (sock == NULL)
1297                 return (ISC_R_NOMEMORY);
1298
1299         result = ISC_R_UNEXPECTED;
1300
1301         sock->magic = 0;
1302         sock->references = 0;
1303
1304         sock->manager = manager;
1305         sock->type = type;
1306         sock->fd = -1;
1307
1308         ISC_LINK_INIT(sock, link);
1309
1310         sock->recvcmsgbuf = NULL;
1311         sock->sendcmsgbuf = NULL;
1312
1313         /*
1314          * set up cmsg buffers
1315          */
1316         cmsgbuflen = 0;
1317 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1318         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1319 #endif
1320 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1321         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1322 #endif
1323         sock->recvcmsgbuflen = cmsgbuflen;
1324         if (sock->recvcmsgbuflen != 0U) {
1325                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1326                 if (sock->recvcmsgbuf == NULL)
1327                         goto error;
1328         }
1329
1330         cmsgbuflen = 0;
1331 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1332         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1333 #endif
1334         sock->sendcmsgbuflen = cmsgbuflen;
1335         if (sock->sendcmsgbuflen != 0U) {
1336                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1337                 if (sock->sendcmsgbuf == NULL)
1338                         goto error;
1339         }
1340
1341         /*
1342          * set up list of readers and writers to be initially empty
1343          */
1344         ISC_LIST_INIT(sock->recv_list);
1345         ISC_LIST_INIT(sock->send_list);
1346         ISC_LIST_INIT(sock->accept_list);
1347         sock->connect_ev = NULL;
1348         sock->pending_recv = 0;
1349         sock->pending_send = 0;
1350         sock->pending_accept = 0;
1351         sock->listener = 0;
1352         sock->connected = 0;
1353         sock->connecting = 0;
1354         sock->bound = 0;
1355
1356         /*
1357          * initialize the lock
1358          */
1359         result = isc_mutex_init(&sock->lock);
1360         if (result != ISC_R_SUCCESS) {
1361                 sock->magic = 0;
1362                 goto error;
1363         }
1364
1365         /*
1366          * Initialize readable and writable events
1367          */
1368         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1369                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1370                        NULL, sock, sock, NULL, NULL);
1371         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1372                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1373                        NULL, sock, sock, NULL, NULL);
1374
1375         sock->magic = SOCKET_MAGIC;
1376         *socketp = sock;
1377
1378         return (ISC_R_SUCCESS);
1379
1380  error:
1381         if (sock->recvcmsgbuf != NULL)
1382                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1383                             sock->recvcmsgbuflen);
1384         if (sock->sendcmsgbuf != NULL)
1385                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1386                             sock->sendcmsgbuflen);
1387         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1388
1389         return (result);
1390 }
1391
1392 /*
1393  * This event requires that the various lists be empty, that the reference
1394  * count be 1, and that the magic number is valid.  The other socket bits,
1395  * like the lock, must be initialized as well.  The fd associated must be
1396  * marked as closed, by setting it to -1 on close, or this routine will
1397  * also close the socket.
1398  */
1399 static void
1400 free_socket(isc_socket_t **socketp) {
1401         isc_socket_t *sock = *socketp;
1402
1403         INSIST(sock->references == 0);
1404         INSIST(VALID_SOCKET(sock));
1405         INSIST(!sock->connecting);
1406         INSIST(!sock->pending_recv);
1407         INSIST(!sock->pending_send);
1408         INSIST(!sock->pending_accept);
1409         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1410         INSIST(ISC_LIST_EMPTY(sock->send_list));
1411         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1412         INSIST(!ISC_LINK_LINKED(sock, link));
1413
1414         if (sock->recvcmsgbuf != NULL)
1415                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1416                             sock->recvcmsgbuflen);
1417         if (sock->sendcmsgbuf != NULL)
1418                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1419                             sock->sendcmsgbuflen);
1420
1421         sock->magic = 0;
1422
1423         DESTROYLOCK(&sock->lock);
1424
1425         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1426
1427         *socketp = NULL;
1428 }
1429
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  We have to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * On Linux, parse the "major.minor" prefix of the kernel release string
 * reported by uname() and clear 'bsdcompat' when the kernel is 2.4 or
 * newer.  On other platforms this does nothing.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
        struct utsname uts;
        char *cursor;
        long int rel_major;
        long int rel_minor;

        uname(&uts);    /* Can only fail if the buffer is bad in Linux. */

        /* Paranoia in parsing can be increased, but we trust uname(). */
        rel_major = strtol(uts.release, &cursor, 10);
        if (*cursor != '.')
                return;

        rel_minor = strtol(cursor + 1, &cursor, 10);
        if (rel_major > 2 || (rel_major == 2 && rel_minor >= 4))
                bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
1467
/*%
 * Create a new socket of type 'type' in protocol family 'pf', managed
 * by 'manager'.  The new socket is returned in 'socketp', which must
 * point to a NULL socket pointer on entry.
 */
1474 isc_result_t
1475 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1476                   isc_socket_t **socketp)
1477 {
1478         isc_socket_t *sock = NULL;
1479         isc_result_t result;
1480 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1481         int on = 1;
1482 #endif
1483 #if defined(SO_RCVBUF)
1484         ISC_SOCKADDR_LEN_T optlen;
1485         int size;
1486 #endif
1487         char strbuf[ISC_STRERRORSIZE];
1488         const char *err = "socket";
1489         int try = 0;
1490
1491         REQUIRE(VALID_MANAGER(manager));
1492         REQUIRE(socketp != NULL && *socketp == NULL);
1493
1494         result = allocate_socket(manager, type, &sock);
1495         if (result != ISC_R_SUCCESS)
1496                 return (result);
1497
1498         sock->pf = pf;
1499  again:
1500         switch (type) {
1501         case isc_sockettype_udp:
1502                 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1503                 break;
1504         case isc_sockettype_tcp:
1505                 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1506                 break;
1507         case isc_sockettype_unix:
1508                 sock->fd = socket(pf, SOCK_STREAM, 0);
1509                 break;
1510         }
1511         if (sock->fd == -1 && errno == EINTR && try++ < 42)
1512                 goto again;
1513
1514 #ifdef F_DUPFD
1515         /*
1516          * Leave a space for stdio and TCP to work in.
1517          */
1518         if (manager->reserved != 0 && type == isc_sockettype_udp &&
1519             sock->fd >= 0 && sock->fd < manager->reserved) {
1520                 int new, tmp;
1521                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
1522                 tmp = errno;
1523                 (void)close(sock->fd);
1524                 errno = tmp;
1525                 sock->fd = new;
1526                 err = "isc_socket_create: fcntl/reserved";
1527         } else if (sock->fd >= 0 && sock->fd < 20) {
1528                 int new, tmp;
1529                 new = fcntl(sock->fd, F_DUPFD, 20);
1530                 tmp = errno;
1531                 (void)close(sock->fd);
1532                 errno = tmp;
1533                 sock->fd = new;
1534                 err = "isc_socket_create: fcntl";
1535         }
1536 #endif
1537
1538         if (sock->fd >= (int)manager->fdsize) {
1539                 (void)close(sock->fd);
1540                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1541                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1542                                isc_msgcat, ISC_MSGSET_SOCKET,
1543                                ISC_MSG_TOOMANYFDS,
1544                                "%s: too many open file descriptors", "socket");
1545                 free_socket(&sock);
1546                 return (ISC_R_NORESOURCES);
1547         }
1548         
1549         if (sock->fd < 0) {
1550                 free_socket(&sock);
1551
1552                 switch (errno) {
1553                 case EMFILE:
1554                 case ENFILE:
1555                 case ENOBUFS:
1556                         return (ISC_R_NORESOURCES);
1557
1558                 case EPROTONOSUPPORT:
1559                 case EPFNOSUPPORT:
1560                 case EAFNOSUPPORT:
1561                 /*
1562                  * Linux 2.2 (and maybe others) return EINVAL instead of
1563                  * EAFNOSUPPORT.
1564                  */
1565                 case EINVAL:
1566                         return (ISC_R_FAMILYNOSUPPORT);
1567
1568                 default:
1569                         isc__strerror(errno, strbuf, sizeof(strbuf));
1570                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1571                                          "%s() %s: %s", err,
1572                                          isc_msgcat_get(isc_msgcat,
1573                                                         ISC_MSGSET_GENERAL,
1574                                                         ISC_MSG_FAILED,
1575                                                         "failed"),
1576                                          strbuf);
1577                         return (ISC_R_UNEXPECTED);
1578                 }
1579         }
1580
1581         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1582                 (void)close(sock->fd);
1583                 free_socket(&sock);
1584                 return (ISC_R_UNEXPECTED);
1585         }
1586
1587 #ifdef SO_BSDCOMPAT
1588         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
1589                                   clear_bsdcompat) == ISC_R_SUCCESS);
1590         if (type != isc_sockettype_unix && bsdcompat &&
1591             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1592                        (void *)&on, sizeof(on)) < 0) {
1593                 isc__strerror(errno, strbuf, sizeof(strbuf));
1594                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1595                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1596                                  sock->fd,
1597                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1598                                                 ISC_MSG_FAILED, "failed"),
1599                                  strbuf);
1600                 /* Press on... */
1601         }
1602 #endif
1603
1604 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1605         if (type == isc_sockettype_udp) {
1606
1607 #if defined(USE_CMSG)
1608 #if defined(SO_TIMESTAMP)
1609                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1610                                (void *)&on, sizeof(on)) < 0
1611                     && errno != ENOPROTOOPT) {
1612                         isc__strerror(errno, strbuf, sizeof(strbuf));
1613                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1614                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1615                                          sock->fd, 
1616                                          isc_msgcat_get(isc_msgcat,
1617                                                         ISC_MSGSET_GENERAL,
1618                                                         ISC_MSG_FAILED,
1619                                                         "failed"),
1620                                          strbuf);
1621                         /* Press on... */
1622                 }
1623 #endif /* SO_TIMESTAMP */
1624
1625 #if defined(ISC_PLATFORM_HAVEIPV6)
1626                 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1627                         /*
1628                          * Warn explicitly because this anomaly can be hidden
1629                          * in usual operation (and unexpectedly appear later).
1630                          */
1631                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1632                                          "No buffer available to receive "
1633                                          "IPv6 destination");
1634                 }
1635 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1636 #ifdef IPV6_RECVPKTINFO
1637                 /* RFC 3542 */
1638                 if ((pf == AF_INET6)
1639                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1640                                    (void *)&on, sizeof(on)) < 0)) {
1641                         isc__strerror(errno, strbuf, sizeof(strbuf));
1642                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1643                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1644                                          "%s: %s", sock->fd,
1645                                          isc_msgcat_get(isc_msgcat,
1646                                                         ISC_MSGSET_GENERAL,
1647                                                         ISC_MSG_FAILED,
1648                                                         "failed"),
1649                                          strbuf);
1650                 }
1651 #else
1652                 /* RFC 2292 */
1653                 if ((pf == AF_INET6)
1654                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1655                                    (void *)&on, sizeof(on)) < 0)) {
1656                         isc__strerror(errno, strbuf, sizeof(strbuf));
1657                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1658                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1659                                          sock->fd,
1660                                          isc_msgcat_get(isc_msgcat,
1661                                                         ISC_MSGSET_GENERAL,
1662                                                         ISC_MSG_FAILED,
1663                                                         "failed"),
1664                                          strbuf);
1665                 }
1666 #endif /* IPV6_RECVPKTINFO */
1667 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1668 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
1669                 /* use minimum MTU */
1670                 if (pf == AF_INET6) {
1671                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1672                                          IPV6_USE_MIN_MTU,
1673                                          (void *)&on, sizeof(on));
1674                 }
1675 #endif
1676 #endif /* ISC_PLATFORM_HAVEIPV6 */
1677 #endif /* defined(USE_CMSG) */
1678
1679 #if defined(SO_RCVBUF)
1680                 optlen = sizeof(size);
1681                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1682                                (void *)&size, &optlen) >= 0 &&
1683                      size < RCVBUFSIZE) {
1684                         size = RCVBUFSIZE;
1685                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1686                                        (void *)&size, sizeof(size)) == -1) {
1687                                 isc__strerror(errno, strbuf, sizeof(strbuf));
1688                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1689                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
1690                                         sock->fd, size,
1691                                         isc_msgcat_get(isc_msgcat,
1692                                                        ISC_MSGSET_GENERAL,
1693                                                        ISC_MSG_FAILED,
1694                                                        "failed"),
1695                                         strbuf);
1696                         }
1697                 }
1698 #endif
1699         }
1700 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1701
1702         sock->references = 1;
1703         *socketp = sock;
1704
1705         LOCK(&manager->lock);
1706
1707         /*
1708          * Note we don't have to lock the socket like we normally would because
1709          * there are no external references to it yet.
1710          */
1711
1712         manager->fds[sock->fd] = sock;
1713         manager->fdstate[sock->fd] = MANAGED;
1714         ISC_LIST_APPEND(manager->socklist, sock, link);
1715         if (manager->maxfd < sock->fd)
1716                 manager->maxfd = sock->fd;
1717
1718         UNLOCK(&manager->lock);
1719
1720         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1721                    ISC_MSG_CREATED, "created");
1722
1723         return (ISC_R_SUCCESS);
1724 }
1725
1726 /*
1727  * Attach to a socket.  Caller must explicitly detach when it is done.
1728  */
1729 void
1730 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1731         REQUIRE(VALID_SOCKET(sock));
1732         REQUIRE(socketp != NULL && *socketp == NULL);
1733
1734         LOCK(&sock->lock);
1735         sock->references++;
1736         UNLOCK(&sock->lock);
1737
1738         *socketp = sock;
1739 }
1740
1741 /*
1742  * Dereference a socket.  If this is the last reference to it, clean things
1743  * up by destroying the socket.
1744  */
1745 void
1746 isc_socket_detach(isc_socket_t **socketp) {
1747         isc_socket_t *sock;
1748         isc_boolean_t kill_socket = ISC_FALSE;
1749
1750         REQUIRE(socketp != NULL);
1751         sock = *socketp;
1752         REQUIRE(VALID_SOCKET(sock));
1753
1754         LOCK(&sock->lock);
1755         REQUIRE(sock->references > 0);
1756         sock->references--;
1757         if (sock->references == 0)
1758                 kill_socket = ISC_TRUE;
1759         UNLOCK(&sock->lock);
1760
1761         if (kill_socket)
1762                 destroy(&sock);
1763
1764         *socketp = NULL;
1765 }
1766
1767 /*
1768  * I/O is possible on a given socket.  Schedule an event to this task that
1769  * will call an internal function to do the I/O.  This will charge the
1770  * task with the I/O operation and let our select loop handler get back
1771  * to doing something real as fast as possible.
1772  *
1773  * The socket and manager must be locked before calling this function.
1774  */
1775 static void
1776 dispatch_recv(isc_socket_t *sock) {
1777         intev_t *iev;
1778         isc_socketevent_t *ev;
1779
1780         INSIST(!sock->pending_recv);
1781
1782         ev = ISC_LIST_HEAD(sock->recv_list);
1783         if (ev == NULL)
1784                 return;
1785
1786         sock->pending_recv = 1;
1787         iev = &sock->readable_ev;
1788
1789         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1790                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
1791
1792         sock->references++;
1793         iev->ev_sender = sock;
1794         iev->ev_action = internal_recv;
1795         iev->ev_arg = sock;
1796
1797         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1798 }
1799
1800 static void
1801 dispatch_send(isc_socket_t *sock) {
1802         intev_t *iev;
1803         isc_socketevent_t *ev;
1804
1805         INSIST(!sock->pending_send);
1806
1807         ev = ISC_LIST_HEAD(sock->send_list);
1808         if (ev == NULL)
1809                 return;
1810
1811         sock->pending_send = 1;
1812         iev = &sock->writable_ev;
1813
1814         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1815                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
1816
1817         sock->references++;
1818         iev->ev_sender = sock;
1819         iev->ev_action = internal_send;
1820         iev->ev_arg = sock;
1821
1822         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1823 }
1824
1825 /*
1826  * Dispatch an internal accept event.
1827  */
1828 static void
1829 dispatch_accept(isc_socket_t *sock) {
1830         intev_t *iev;
1831         isc_socket_newconnev_t *ev;
1832
1833         INSIST(!sock->pending_accept);
1834
1835         /*
1836          * Are there any done events left, or were they all canceled
1837          * before the manager got the socket lock?
1838          */
1839         ev = ISC_LIST_HEAD(sock->accept_list);
1840         if (ev == NULL)
1841                 return;
1842
1843         sock->pending_accept = 1;
1844         iev = &sock->readable_ev;
1845
1846         sock->references++;  /* keep socket around for this internal event */
1847         iev->ev_sender = sock;
1848         iev->ev_action = internal_accept;
1849         iev->ev_arg = sock;
1850
1851         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1852 }
1853
1854 static void
1855 dispatch_connect(isc_socket_t *sock) {
1856         intev_t *iev;
1857         isc_socket_connev_t *ev;
1858
1859         iev = &sock->writable_ev;
1860
1861         ev = sock->connect_ev;
1862         INSIST(ev != NULL); /* XXX */
1863
1864         INSIST(sock->connecting);
1865
1866         sock->references++;  /* keep socket around for this internal event */
1867         iev->ev_sender = sock;
1868         iev->ev_action = internal_connect;
1869         iev->ev_arg = sock;
1870
1871         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1872 }
1873
1874 /*
1875  * Dequeue an item off the given socket's read queue, set the result code
1876  * in the done event to the one provided, and send it to the task it was
1877  * destined for.
1878  *
1879  * If the event to be sent is on a list, remove it before sending.  If
1880  * asked to, send and detach from the socket as well.
1881  *
1882  * Caller must have the socket locked if the event is attached to the socket.
1883  */
1884 static void
1885 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1886         isc_task_t *task;
1887
1888         task = (*dev)->ev_sender;
1889
1890         (*dev)->ev_sender = sock;
1891
1892         if (ISC_LINK_LINKED(*dev, ev_link))
1893                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1894
1895         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1896             == ISC_SOCKEVENTATTR_ATTACHED)
1897                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1898         else
1899                 isc_task_send(task, (isc_event_t **)dev);
1900 }
1901
1902 /*
1903  * See comments for send_recvdone_event() above.
1904  *
1905  * Caller must have the socket locked if the event is attached to the socket.
1906  */
1907 static void
1908 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1909         isc_task_t *task;
1910
1911         INSIST(dev != NULL && *dev != NULL);
1912
1913         task = (*dev)->ev_sender;
1914         (*dev)->ev_sender = sock;
1915
1916         if (ISC_LINK_LINKED(*dev, ev_link))
1917                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1918
1919         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1920             == ISC_SOCKEVENTATTR_ATTACHED)
1921                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1922         else
1923                 isc_task_send(task, (isc_event_t **)dev);
1924 }
1925
1926 /*
1927  * Call accept() on a socket, to get the new file descriptor.  The listen
1928  * socket is used as a prototype to create a new isc_socket_t.  The new
1929  * socket has one outstanding reference.  The task receiving the event
1930  * will be detached from just after the event is delivered.
1931  *
1932  * On entry to this function, the event delivered is the internal
1933  * readable event, and the first item on the accept_list should be
1934  * the done event we want to send.  If the list is empty, this is a no-op,
1935  * so just unlock and return.
1936  */
1937 static void
1938 internal_accept(isc_task_t *me, isc_event_t *ev) {
1939         isc_socket_t *sock;
1940         isc_socketmgr_t *manager;
1941         isc_socket_newconnev_t *dev;
1942         isc_task_t *task;
1943         ISC_SOCKADDR_LEN_T addrlen;
1944         int fd;
1945         isc_result_t result = ISC_R_SUCCESS;
1946         char strbuf[ISC_STRERRORSIZE];
1947         const char *err = "accept";
1948
1949         UNUSED(me);
1950
1951         sock = ev->ev_sender;
1952         INSIST(VALID_SOCKET(sock));
1953
1954         LOCK(&sock->lock);
1955         socket_log(sock, NULL, TRACE,
1956                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1957                    "internal_accept called, locked socket");
1958
1959         manager = sock->manager;
1960         INSIST(VALID_MANAGER(manager));
1961
1962         INSIST(sock->listener);
1963         INSIST(sock->pending_accept == 1);
1964         sock->pending_accept = 0;
1965
1966         INSIST(sock->references > 0);
1967         sock->references--;  /* the internal event is done with this socket */
1968         if (sock->references == 0) {
1969                 UNLOCK(&sock->lock);
1970                 destroy(&sock);
1971                 return;
1972         }
1973
1974         /*
1975          * Get the first item off the accept list.
1976          * If it is empty, unlock the socket and return.
1977          */
1978         dev = ISC_LIST_HEAD(sock->accept_list);
1979         if (dev == NULL) {
1980                 UNLOCK(&sock->lock);
1981                 return;
1982         }
1983
1984         /*
1985          * Try to accept the new connection.  If the accept fails with
1986          * EAGAIN or EINTR, simply poke the watcher to watch this socket
1987          * again.  Also ignore ECONNRESET, which has been reported to
1988          * be spuriously returned on Linux 2.2.19 although it is not
1989          * a documented error for accept().  ECONNABORTED has been
1990          * reported for Solaris 8.  The rest are thrown in not because
1991          * we have seen them but because they are ignored by other
1992          * deamons such as BIND 8 and Apache.
1993          */
1994
1995         addrlen = sizeof(dev->newsocket->address.type);
1996         memset(&dev->newsocket->address.type.sa, 0, addrlen);
1997         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1998                     (void *)&addrlen);
1999
2000 #ifdef F_DUPFD
2001         /*
2002          * Leave a space for stdio to work in.
2003          */
2004         if (fd >= 0 && fd < 20) {
2005                 int new, tmp;
2006                 new = fcntl(fd, F_DUPFD, 20);
2007                 tmp = errno;
2008                 (void)close(fd);
2009                 errno = tmp;
2010                 fd = new;
2011                 err = "accept/fcntl";
2012         }
2013 #endif
2014
2015         if (fd < 0) {
2016                 if (SOFT_ERROR(errno))
2017                         goto soft_error;
2018                 switch (errno) {
2019                 case ENFILE:
2020                 case EMFILE:
2021                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2022                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2023                                        isc_msgcat, ISC_MSGSET_SOCKET,
2024                                        ISC_MSG_TOOMANYFDS,
2025                                        "%s: too many open file descriptors",
2026                                        err);
2027                         goto soft_error;
2028
2029                 case ENOBUFS:
2030                 case ENOMEM:
2031                 case ECONNRESET:
2032                 case ECONNABORTED:
2033                 case EHOSTUNREACH:
2034                 case EHOSTDOWN:
2035                 case ENETUNREACH:
2036                 case ENETDOWN:
2037                 case ECONNREFUSED:
2038 #ifdef EPROTO
2039                 case EPROTO:
2040 #endif
2041 #ifdef ENONET
2042                 case ENONET:
2043 #endif
2044                         goto soft_error;
2045                 default:
2046                         break;
2047                 }
2048                 isc__strerror(errno, strbuf, sizeof(strbuf));
2049                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2050                                  "internal_accept: %s() %s: %s", err,
2051                                  isc_msgcat_get(isc_msgcat,
2052                                                 ISC_MSGSET_GENERAL,
2053                                                 ISC_MSG_FAILED,
2054                                                 "failed"),
2055                                  strbuf);
2056                 fd = -1;
2057                 result = ISC_R_UNEXPECTED;
2058         } else {
2059                 if (addrlen == 0U) {
2060                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2061                                          "internal_accept(): "
2062                                          "accept() failed to return "
2063                                          "remote address");
2064
2065                         (void)close(fd);
2066                         goto soft_error;
2067                 } else if (dev->newsocket->address.type.sa.sa_family !=
2068                            sock->pf)
2069                 {
2070                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2071                                          "internal_accept(): "
2072                                          "accept() returned peer address "
2073                                          "family %u (expected %u)", 
2074                                          dev->newsocket->address.
2075                                          type.sa.sa_family,
2076                                          sock->pf);
2077                         (void)close(fd);
2078                         goto soft_error;
2079                 } else if (fd >= (int)manager->fdsize) {
2080                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2081                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2082                                        isc_msgcat, ISC_MSGSET_SOCKET,
2083                                        ISC_MSG_TOOMANYFDS,
2084                                        "%s: too many open file descriptors",
2085                                        "accept");
2086                         (void)close(fd);
2087                         goto soft_error;
2088                 }
2089         }
2090
2091         if (fd != -1) {
2092                 dev->newsocket->address.length = addrlen;
2093                 dev->newsocket->pf = sock->pf;
2094         }
2095
2096         /*
2097          * Pull off the done event.
2098          */
2099         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2100
2101         /*
2102          * Poke watcher if there are more pending accepts.
2103          */
2104         if (!ISC_LIST_EMPTY(sock->accept_list))
2105                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2106
2107         UNLOCK(&sock->lock);
2108
2109         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2110                 (void)close(fd);
2111                 fd = -1;
2112                 result = ISC_R_UNEXPECTED;
2113         }
2114
2115         /*
2116          * -1 means the new socket didn't happen.
2117          */
2118         if (fd != -1) {
2119                 LOCK(&manager->lock);
2120                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2121
2122                 dev->newsocket->fd = fd;
2123                 dev->newsocket->bound = 1;
2124                 dev->newsocket->connected = 1;
2125
2126                 /*
2127                  * Save away the remote address
2128                  */
2129                 dev->address = dev->newsocket->address;
2130
2131                 manager->fds[fd] = dev->newsocket;
2132                 manager->fdstate[fd] = MANAGED;
2133                 if (manager->maxfd < fd)
2134                         manager->maxfd = fd;
2135
2136                 socket_log(sock, &dev->newsocket->address, CREATION,
2137                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2138                            "accepted connection, new socket %p",
2139                            dev->newsocket);
2140
2141                 UNLOCK(&manager->lock);
2142         } else {
2143                 dev->newsocket->references--;
2144                 free_socket(&dev->newsocket);
2145         }
2146         
2147         /*
2148          * Fill in the done event details and send it off.
2149          */
2150         dev->result = result;
2151         task = dev->ev_sender;
2152         dev->ev_sender = sock;
2153
2154         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2155         return;
2156
2157  soft_error:
2158         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2159         UNLOCK(&sock->lock);
2160         return;
2161 }
2162
2163 static void
2164 internal_recv(isc_task_t *me, isc_event_t *ev) {
2165         isc_socketevent_t *dev;
2166         isc_socket_t *sock;
2167
2168         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2169
2170         sock = ev->ev_sender;
2171         INSIST(VALID_SOCKET(sock));
2172
2173         LOCK(&sock->lock);
2174         socket_log(sock, NULL, IOEVENT,
2175                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2176                    "internal_recv: task %p got event %p", me, ev);
2177
2178         INSIST(sock->pending_recv == 1);
2179         sock->pending_recv = 0;
2180
2181         INSIST(sock->references > 0);
2182         sock->references--;  /* the internal event is done with this socket */
2183         if (sock->references == 0) {
2184                 UNLOCK(&sock->lock);
2185                 destroy(&sock);
2186                 return;
2187         }
2188
2189         /*
2190          * Try to do as much I/O as possible on this socket.  There are no
2191          * limits here, currently.
2192          */
2193         dev = ISC_LIST_HEAD(sock->recv_list);
2194         while (dev != NULL) {
2195                 switch (doio_recv(sock, dev)) {
2196                 case DOIO_SOFT:
2197                         goto poke;
2198
2199                 case DOIO_EOF:
2200                         /*
2201                          * read of 0 means the remote end was closed.
2202                          * Run through the event queue and dispatch all
2203                          * the events with an EOF result code.
2204                          */
2205                         do {
2206                                 dev->result = ISC_R_EOF;
2207                                 send_recvdone_event(sock, &dev);
2208                                 dev = ISC_LIST_HEAD(sock->recv_list);
2209                         } while (dev != NULL);
2210                         goto poke;
2211
2212                 case DOIO_SUCCESS:
2213                 case DOIO_HARD:
2214                         send_recvdone_event(sock, &dev);
2215                         break;
2216                 }
2217
2218                 dev = ISC_LIST_HEAD(sock->recv_list);
2219         }
2220
2221  poke:
2222         if (!ISC_LIST_EMPTY(sock->recv_list))
2223                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2224
2225         UNLOCK(&sock->lock);
2226 }
2227
2228 static void
2229 internal_send(isc_task_t *me, isc_event_t *ev) {
2230         isc_socketevent_t *dev;
2231         isc_socket_t *sock;
2232
2233         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2234
2235         /*
2236          * Find out what socket this is and lock it.
2237          */
2238         sock = (isc_socket_t *)ev->ev_sender;
2239         INSIST(VALID_SOCKET(sock));
2240
2241         LOCK(&sock->lock);
2242         socket_log(sock, NULL, IOEVENT,
2243                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2244                    "internal_send: task %p got event %p", me, ev);
2245
2246         INSIST(sock->pending_send == 1);
2247         sock->pending_send = 0;
2248
2249         INSIST(sock->references > 0);
2250         sock->references--;  /* the internal event is done with this socket */
2251         if (sock->references == 0) {
2252                 UNLOCK(&sock->lock);
2253                 destroy(&sock);
2254                 return;
2255         }
2256
2257         /*
2258          * Try to do as much I/O as possible on this socket.  There are no
2259          * limits here, currently.
2260          */
2261         dev = ISC_LIST_HEAD(sock->send_list);
2262         while (dev != NULL) {
2263                 switch (doio_send(sock, dev)) {
2264                 case DOIO_SOFT:
2265                         goto poke;
2266
2267                 case DOIO_HARD:
2268                 case DOIO_SUCCESS:
2269                         send_senddone_event(sock, &dev);
2270                         break;
2271                 }
2272
2273                 dev = ISC_LIST_HEAD(sock->send_list);
2274         }
2275
2276  poke:
2277         if (!ISC_LIST_EMPTY(sock->send_list))
2278                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2279
2280         UNLOCK(&sock->lock);
2281 }
2282
2283 static void
2284 process_fds(isc_socketmgr_t *manager, int maxfd,
2285             fd_set *readfds, fd_set *writefds)
2286 {
2287         int i;
2288         isc_socket_t *sock;
2289         isc_boolean_t unlock_sock;
2290
2291         REQUIRE(maxfd <= (int)manager->fdsize);
2292
2293         /*
2294          * Process read/writes on other fds here.  Avoid locking
2295          * and unlocking twice if both reads and writes are possible.
2296          */
2297         for (i = 0; i < maxfd; i++) {
2298 #ifdef ISC_PLATFORM_USETHREADS
2299                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2300                         continue;
2301 #endif /* ISC_PLATFORM_USETHREADS */
2302
2303                 if (manager->fdstate[i] == CLOSE_PENDING) {
2304                         manager->fdstate[i] = CLOSED;
2305                         FD_CLR(i, manager->read_fds);
2306                         FD_CLR(i, manager->write_fds);
2307
2308                         (void)close(i);
2309
2310                         continue;
2311                 }
2312
2313                 sock = manager->fds[i];
2314                 unlock_sock = ISC_FALSE;
2315                 if (FD_ISSET(i, readfds)) {
2316                         if (sock == NULL) {
2317                                 FD_CLR(i, manager->read_fds);
2318                                 goto check_write;
2319                         }
2320                         unlock_sock = ISC_TRUE;
2321                         LOCK(&sock->lock);
2322                         if (!SOCK_DEAD(sock)) {
2323                                 if (sock->listener)
2324                                         dispatch_accept(sock);
2325                                 else
2326                                         dispatch_recv(sock);
2327                         }
2328                         FD_CLR(i, manager->read_fds);
2329                 }
2330         check_write:
2331                 if (FD_ISSET(i, writefds)) {
2332                         if (sock == NULL) {
2333                                 FD_CLR(i, manager->write_fds);
2334                                 continue;
2335                         }
2336                         if (!unlock_sock) {
2337                                 unlock_sock = ISC_TRUE;
2338                                 LOCK(&sock->lock);
2339                         }
2340                         if (!SOCK_DEAD(sock)) {
2341                                 if (sock->connecting)
2342                                         dispatch_connect(sock);
2343                                 else
2344                                         dispatch_send(sock);
2345                         }
2346                         FD_CLR(i, manager->write_fds);
2347                 }
2348                 if (unlock_sock)
2349                         UNLOCK(&sock->lock);
2350         }
2351 }
2352
#ifdef ISC_PLATFORM_USETHREADS
/*
 * Thread body of the socket watcher.  'uap' is the socket manager.
 *
 * The thread loops until it reads a shutdown message from the manager's
 * control pipe.  Each iteration:
 *  1. copies the manager's read/write fd sets and blocks in select() on
 *     the copies (the manager lock is released only around select());
 *     soft errors are retried, any other select() failure is fatal;
 *  2. if the control pipe is readable, drains it, handing each wakeup
 *     message to wakeup_socket();
 *  3. calls process_fds() on the select() results so the I/O gets posted
 *     to the task that should perform it.
 *
 * Returns 0 (as isc_threadresult_t) when shut down.
 */
static isc_threadresult_t
watcher(void *uap) {
        isc_socketmgr_t *manager = uap;
        isc_boolean_t shutting_down;
        int ctlfd;
        int nready;
        int msg, fd;
        int maxfd;
        char errtext[ISC_STRERRORSIZE];

        /*
         * The control fd is fetched once; it never changes.
         */
        LOCK(&manager->lock);
        ctlfd = manager->pipe_fds[0];

        shutting_down = ISC_FALSE;
        while (!shutting_down) {
                /* Wait for activity, retrying select() on soft errors. */
                do {
                        memcpy(manager->read_fds_copy, manager->read_fds,
                               manager->fd_bufsize);
                        memcpy(manager->write_fds_copy, manager->write_fds,
                               manager->fd_bufsize);
                        maxfd = manager->maxfd + 1;

                        UNLOCK(&manager->lock);

                        nready = select(maxfd, manager->read_fds_copy,
                                        manager->write_fds_copy, NULL, NULL);
                        if (nready < 0 && !SOFT_ERROR(errno)) {
                                isc__strerror(errno, errtext,
                                              sizeof(errtext));
                                FATAL_ERROR(__FILE__, __LINE__,
                                            "select() %s: %s",
                                            isc_msgcat_get(isc_msgcat,
                                                           ISC_MSGSET_GENERAL,
                                                           ISC_MSG_FAILED,
                                                           "failed"),
                                            errtext);
                        }

                        LOCK(&manager->lock);
                } while (nready < 0);

                /*
                 * Drain messages from the internal control fd.
                 */
                if (FD_ISSET(ctlfd, manager->read_fds_copy)) {
                        isc_boolean_t more = ISC_TRUE;

                        while (more) {
                                select_readmsg(manager, &fd, &msg);

                                manager_log(manager, IOEVENT,
                                            isc_msgcat_get(isc_msgcat,
                                                     ISC_MSGSET_SOCKET,
                                                     ISC_MSG_WATCHERMSG,
                                                     "watcher got message %d"),
                                            msg);

                                if (msg == SELECT_POKE_NOTHING) {
                                        /* Nothing (more) to read. */
                                        more = ISC_FALSE;
                                } else if (msg == SELECT_POKE_SHUTDOWN) {
                                        /*
                                         * Shutdown requested.  The outer
                                         * loop is not left immediately;
                                         * one more process_fds() pass is
                                         * done first, which is harmless.
                                         */
                                        shutting_down = ISC_TRUE;
                                        more = ISC_FALSE;
                                } else {
                                        /*
                                         * A wakeup on a socket: look at
                                         * its read and write queues and
                                         * decide whether it needs to be
                                         * watched now.
                                         */
                                        wakeup_socket(manager, fd, msg);
                                }
                        }
                }

                process_fds(manager, maxfd, manager->read_fds_copy,
                            manager->write_fds_copy);
        }

        manager_log(manager, TRACE,
                    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                   ISC_MSG_EXITING, "watcher exiting"));

        UNLOCK(&manager->lock);
        return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
2462
2463 void
2464 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
2465
2466         REQUIRE(VALID_MANAGER(manager));
2467
2468         manager->reserved = reserved;
2469 }
2470
2471 /*
2472  * Initialize fdsets in socketmgr structure.
2473  */
2474 static isc_result_t
2475 create_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
2476 #if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
2477         manager->fdsize = ISC_SOCKET_FDSETSIZE;
2478         manager->fd_bufsize = howmany(ISC_SOCKET_FDSETSIZE, NFDBITS) *
2479                 sizeof(fd_mask);
2480 #else
2481         manager->fdsize = FD_SETSIZE;
2482         manager->fd_bufsize = sizeof(fd_set);
2483 #endif
2484
2485         manager->fds = NULL;
2486         manager->fdstate = NULL;
2487         manager->read_fds = NULL;
2488         manager->read_fds_copy = NULL;
2489         manager->write_fds = NULL;
2490         manager->write_fds_copy = NULL;
2491
2492         manager->fds = isc_mem_get(mctx,
2493                                    manager->fdsize * sizeof(manager->fds[0]));
2494         if (manager->fds == NULL)
2495                 goto fail;
2496
2497         manager->fdstate = isc_mem_get(mctx, manager->fdsize *
2498                                        sizeof(manager->fdstate[0]));
2499         if (manager->fdstate == NULL)
2500                 goto fail;
2501
2502         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
2503         if (manager->read_fds == NULL)
2504                 goto fail;
2505         manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
2506         if (manager->read_fds_copy == NULL)
2507                 goto fail;
2508         manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
2509         if (manager->write_fds == NULL)
2510                 goto fail;
2511         manager->write_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
2512         if (manager->write_fds_copy == NULL)
2513                 goto fail;
2514
2515         return (ISC_R_SUCCESS);
2516
2517   fail:
2518         cleanup_fdsets(manager, mctx);
2519         return (ISC_R_NOMEMORY);
2520 }
2521
2522 /*
2523  * Clean up fdsets in socketmgr structure.
2524  */
2525 static void
2526 cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
2527         if (manager->fds != NULL) {
2528                 isc_mem_put(mctx, manager->fds,
2529                             manager->fdsize * sizeof(manager->fds[0]));
2530         }
2531         if (manager->fdstate != NULL) {
2532                 isc_mem_put(mctx, manager->fdstate,
2533                             manager->fdsize * sizeof(manager->fdstate[0]));
2534         }
2535         if (manager->read_fds != NULL)
2536                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
2537         if (manager->read_fds_copy != NULL)
2538                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
2539         if (manager->write_fds != NULL)
2540                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
2541         if (manager->write_fds_copy != NULL)
2542                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
2543 }
2544
2545 /*
2546  * Create a new socket manager.
2547  */
2548 isc_result_t
2549 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2550         isc_socketmgr_t *manager;
2551 #ifdef ISC_PLATFORM_USETHREADS
2552         char strbuf[ISC_STRERRORSIZE];
2553 #endif
2554         isc_result_t result;
2555
2556         REQUIRE(managerp != NULL && *managerp == NULL);
2557
2558 #ifndef ISC_PLATFORM_USETHREADS
2559         if (socketmgr != NULL) {
2560                 socketmgr->refs++;
2561                 *managerp = socketmgr;
2562                 return (ISC_R_SUCCESS);
2563         }
2564 #endif /* ISC_PLATFORM_USETHREADS */
2565
2566         manager = isc_mem_get(mctx, sizeof(*manager));
2567         if (manager == NULL)
2568                 return (ISC_R_NOMEMORY);
2569
2570         result = create_fdsets(manager, mctx);
2571         if (result != ISC_R_SUCCESS) {
2572                 cleanup_fdsets(manager, mctx);
2573                 isc_mem_put(mctx, manager, sizeof(*manager));
2574                 return (result);
2575         }
2576
2577         manager->magic = SOCKET_MANAGER_MAGIC;
2578         manager->mctx = NULL;
2579         memset(manager->fds, 0, sizeof(manager->fds[0]) * manager->fdsize);
2580         ISC_LIST_INIT(manager->socklist);
2581         result = isc_mutex_init(&manager->lock);
2582         if (result != ISC_R_SUCCESS) {
2583                 cleanup_fdsets(manager, mctx);
2584                 isc_mem_put(mctx, manager, sizeof(*manager));
2585                 return (result);
2586         }
2587 #ifdef ISC_PLATFORM_USETHREADS
2588         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2589                 cleanup_fdsets(manager, mctx);
2590                 DESTROYLOCK(&manager->lock);
2591                 isc_mem_put(mctx, manager, sizeof(*manager));
2592                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2593                                  "isc_condition_init() %s",
2594                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2595                                                 ISC_MSG_FAILED, "failed"));
2596                 return (ISC_R_UNEXPECTED);
2597         }
2598
2599         /*
2600          * Create the special fds that will be used to wake up the
2601          * select/poll loop when something internal needs to be done.
2602          */
2603         if (pipe(manager->pipe_fds) != 0) {
2604                 cleanup_fdsets(manager, mctx);
2605                 DESTROYLOCK(&manager->lock);
2606                 isc_mem_put(mctx, manager, sizeof(*manager));
2607                 isc__strerror(errno, strbuf, sizeof(strbuf));
2608                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2609                                  "pipe() %s: %s",
2610                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2611                                                 ISC_MSG_FAILED, "failed"),
2612                                  strbuf);
2613
2614                 return (ISC_R_UNEXPECTED);
2615         }
2616
2617         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2618 #if 0
2619         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2620 #endif
2621 #else /* ISC_PLATFORM_USETHREADS */
2622         manager->refs = 1;
2623 #endif /* ISC_PLATFORM_USETHREADS */
2624
2625         /*
2626          * Set up initial state for the select loop
2627          */
2628         memset(manager->read_fds, 0, manager->fd_bufsize);
2629         memset(manager->write_fds, 0, manager->fd_bufsize);
2630 #ifdef ISC_PLATFORM_USETHREADS
2631         FD_SET(manager->pipe_fds[0], manager->read_fds);
2632         manager->maxfd = manager->pipe_fds[0];
2633 #else /* ISC_PLATFORM_USETHREADS */
2634         manager->maxfd = 0;
2635 #endif /* ISC_PLATFORM_USETHREADS */
2636         manager->reserved = 0;
2637         memset(manager->fdstate, 0,
2638                manager->fdsize * sizeof(manager->fdstate[0]));
2639
2640 #ifdef ISC_PLATFORM_USETHREADS
2641         /*
2642          * Start up the select/poll thread.
2643          */
2644         if (isc_thread_create(watcher, manager, &manager->watcher) !=
2645             ISC_R_SUCCESS) {
2646                 (void)close(manager->pipe_fds[0]);
2647                 (void)close(manager->pipe_fds[1]);
2648                 DESTROYLOCK(&manager->lock);
2649                 isc_mem_put(mctx, manager, sizeof(*manager));
2650                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2651                                  "isc_thread_create() %s",
2652                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2653                                                 ISC_MSG_FAILED, "failed"));
2654                 return (ISC_R_UNEXPECTED);
2655         }
2656 #endif /* ISC_PLATFORM_USETHREADS */
2657         isc_mem_attach(mctx, &manager->mctx);
2658
2659 #ifndef ISC_PLATFORM_USETHREADS
2660         socketmgr = manager;
2661 #endif /* ISC_PLATFORM_USETHREADS */
2662         *managerp = manager;
2663
2664         return (ISC_R_SUCCESS);
2665 }
2666
2667 void
2668 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2669         isc_socketmgr_t *manager;
2670         int i;
2671         isc_mem_t *mctx;
2672
2673         /*
2674          * Destroy a socket manager.
2675          */
2676
2677         REQUIRE(managerp != NULL);
2678         manager = *managerp;
2679         REQUIRE(VALID_MANAGER(manager));
2680
2681 #ifndef ISC_PLATFORM_USETHREADS
2682         if (manager->refs > 1) {
2683                 manager->refs--;
2684                 *managerp = NULL;
2685                 return;
2686         }
2687 #endif /* ISC_PLATFORM_USETHREADS */
2688
2689         LOCK(&manager->lock);
2690
2691 #ifdef ISC_PLATFORM_USETHREADS
2692         /*
2693          * Wait for all sockets to be destroyed.
2694          */
2695         while (!ISC_LIST_EMPTY(manager->socklist)) {
2696                 manager_log(manager, CREATION,
2697                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2698                                            ISC_MSG_SOCKETSREMAIN,
2699                                            "sockets exist"));
2700                 WAIT(&manager->shutdown_ok, &manager->lock);
2701         }
2702 #else /* ISC_PLATFORM_USETHREADS */
2703         /*
2704          * Hope all sockets have been destroyed.
2705          */
2706         if (!ISC_LIST_EMPTY(manager->socklist)) {
2707                 manager_log(manager, CREATION,
2708                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2709                                            ISC_MSG_SOCKETSREMAIN,
2710                                            "sockets exist"));
2711                 INSIST(0);
2712         }
2713 #endif /* ISC_PLATFORM_USETHREADS */
2714
2715         UNLOCK(&manager->lock);
2716
2717         /*
2718          * Here, poke our select/poll thread.  Do this by closing the write
2719          * half of the pipe, which will send EOF to the read half.
2720          * This is currently a no-op in the non-threaded case.
2721          */
2722         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2723
2724 #ifdef ISC_PLATFORM_USETHREADS
2725         /*
2726          * Wait for thread to exit.
2727          */
2728         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2729                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2730                                  "isc_thread_join() %s",
2731                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2732                                                 ISC_MSG_FAILED, "failed"));
2733 #endif /* ISC_PLATFORM_USETHREADS */
2734
2735         /*
2736          * Clean up.
2737          */
2738 #ifdef ISC_PLATFORM_USETHREADS
2739         (void)close(manager->pipe_fds[0]);
2740         (void)close(manager->pipe_fds[1]);
2741         (void)isc_condition_destroy(&manager->shutdown_ok);
2742 #endif /* ISC_PLATFORM_USETHREADS */
2743
2744         for (i = 0; i < (int)manager->fdsize; i++)
2745                 if (manager->fdstate[i] == CLOSE_PENDING)
2746                         (void)close(i);
2747
2748         DESTROYLOCK(&manager->lock);
2749         cleanup_fdsets(manager, manager->mctx);
2750         manager->magic = 0;
2751         mctx= manager->mctx;
2752         isc_mem_put(mctx, manager, sizeof(*manager));
2753
2754         isc_mem_detach(&mctx);
2755
2756         *managerp = NULL;
2757 }
2758
2759 static isc_result_t
2760 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2761             unsigned int flags)
2762 {
2763         int io_state;
2764         isc_boolean_t have_lock = ISC_FALSE;
2765         isc_task_t *ntask = NULL;
2766         isc_result_t result = ISC_R_SUCCESS;
2767
2768         dev->ev_sender = task;
2769
2770         if (sock->type == isc_sockettype_udp) {
2771                 io_state = doio_recv(sock, dev);
2772         } else {
2773                 LOCK(&sock->lock);
2774                 have_lock = ISC_TRUE;
2775
2776                 if (ISC_LIST_EMPTY(sock->recv_list))
2777                         io_state = doio_recv(sock, dev);
2778                 else
2779                         io_state = DOIO_SOFT;
2780         }
2781
2782         switch (io_state) {
2783         case DOIO_SOFT:
2784                 /*
2785                  * We couldn't read all or part of the request right now, so
2786                  * queue it.
2787                  *
2788                  * Attach to socket and to task
2789                  */
2790                 isc_task_attach(task, &ntask);
2791                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2792
2793                 if (!have_lock) {
2794                         LOCK(&sock->lock);
2795                         have_lock = ISC_TRUE;
2796                 }
2797
2798                 /*
2799                  * Enqueue the request.  If the socket was previously not being
2800                  * watched, poke the watcher to start paying attention to it.
2801                  */
2802                 if (ISC_LIST_EMPTY(sock->recv_list))
2803                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2804                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2805
2806                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2807                            "socket_recv: event %p -> task %p",
2808                            dev, ntask);
2809
2810                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2811                         result = ISC_R_INPROGRESS;
2812                 break;
2813
2814         case DOIO_EOF:
2815                 dev->result = ISC_R_EOF;
2816                 /* fallthrough */
2817
2818         case DOIO_HARD:
2819         case DOIO_SUCCESS:
2820                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2821                         send_recvdone_event(sock, &dev);
2822                 break;
2823         }
2824
2825         if (have_lock)
2826                 UNLOCK(&sock->lock);
2827
2828         return (result);
2829 }
2830
2831 isc_result_t
2832 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2833                  unsigned int minimum, isc_task_t *task,
2834                  isc_taskaction_t action, const void *arg)
2835 {
2836         isc_socketevent_t *dev;
2837         isc_socketmgr_t *manager;
2838         unsigned int iocount;
2839         isc_buffer_t *buffer;
2840
2841         REQUIRE(VALID_SOCKET(sock));
2842         REQUIRE(buflist != NULL);
2843         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2844         REQUIRE(task != NULL);
2845         REQUIRE(action != NULL);
2846
2847         manager = sock->manager;
2848         REQUIRE(VALID_MANAGER(manager));
2849
2850         iocount = isc_bufferlist_availablecount(buflist);
2851         REQUIRE(iocount > 0);
2852
2853         INSIST(sock->bound);
2854
2855         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2856         if (dev == NULL) {
2857                 return (ISC_R_NOMEMORY);
2858         }
2859
2860         /*
2861          * UDP sockets are always partial read
2862          */
2863         if (sock->type == isc_sockettype_udp)
2864                 dev->minimum = 1;
2865         else {
2866                 if (minimum == 0)
2867                         dev->minimum = iocount;
2868                 else
2869                         dev->minimum = minimum;
2870         }
2871
2872         /*
2873          * Move each buffer from the passed in list to our internal one.
2874          */
2875         buffer = ISC_LIST_HEAD(*buflist);
2876         while (buffer != NULL) {
2877                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2878                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2879                 buffer = ISC_LIST_HEAD(*buflist);
2880         }
2881
2882         return (socket_recv(sock, dev, task, 0));
2883 }
2884
2885 isc_result_t
2886 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2887                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2888 {
2889         isc_socketevent_t *dev;
2890         isc_socketmgr_t *manager;
2891
2892         REQUIRE(VALID_SOCKET(sock));
2893         REQUIRE(action != NULL);
2894
2895         manager = sock->manager;
2896         REQUIRE(VALID_MANAGER(manager));
2897
2898         INSIST(sock->bound);
2899
2900         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2901         if (dev == NULL)
2902                 return (ISC_R_NOMEMORY);
2903
2904         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2905 }
2906
2907 isc_result_t
2908 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2909                  unsigned int minimum, isc_task_t *task,
2910                  isc_socketevent_t *event, unsigned int flags)
2911 {
2912         event->ev_sender = sock;
2913         event->result = ISC_R_UNEXPECTED;
2914         ISC_LIST_INIT(event->bufferlist);
2915         event->region = *region;
2916         event->n = 0;
2917         event->offset = 0;
2918         event->attributes = 0;
2919
2920         /*
2921          * UDP sockets are always partial read.
2922          */
2923         if (sock->type == isc_sockettype_udp)
2924                 event->minimum = 1;
2925         else {
2926                 if (minimum == 0)
2927                         event->minimum = region->length;
2928                 else
2929                         event->minimum = minimum;
2930         }
2931
2932         return (socket_recv(sock, event, task, flags));
2933 }
2934
2935 static isc_result_t
2936 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2937             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2938             unsigned int flags)
2939 {
2940         int io_state;
2941         isc_boolean_t have_lock = ISC_FALSE;
2942         isc_task_t *ntask = NULL;
2943         isc_result_t result = ISC_R_SUCCESS;
2944
2945         dev->ev_sender = task;
2946
2947         set_dev_address(address, sock, dev);
2948         if (pktinfo != NULL) {
2949                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2950                 dev->pktinfo = *pktinfo;
2951
2952                 if (!isc_sockaddr_issitelocal(&dev->address) &&
2953                     !isc_sockaddr_islinklocal(&dev->address)) {
2954                         socket_log(sock, NULL, TRACE, isc_msgcat,
2955                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2956                                    "pktinfo structure provided, ifindex %u "
2957                                    "(set to 0)", pktinfo->ipi6_ifindex);
2958
2959                         /*
2960                          * Set the pktinfo index to 0 here, to let the
2961                          * kernel decide what interface it should send on.
2962                          */
2963                         dev->pktinfo.ipi6_ifindex = 0;
2964                 }
2965         }
2966
2967         if (sock->type == isc_sockettype_udp)
2968                 io_state = doio_send(sock, dev);
2969         else {
2970                 LOCK(&sock->lock);
2971                 have_lock = ISC_TRUE;
2972
2973                 if (ISC_LIST_EMPTY(sock->send_list))
2974                         io_state = doio_send(sock, dev);
2975                 else
2976                         io_state = DOIO_SOFT;
2977         }
2978
2979         switch (io_state) {
2980         case DOIO_SOFT:
2981                 /*
2982                  * We couldn't send all or part of the request right now, so
2983                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2984                  */
2985                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2986                         isc_task_attach(task, &ntask);
2987                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2988
2989                         if (!have_lock) {
2990                                 LOCK(&sock->lock);
2991                                 have_lock = ISC_TRUE;
2992                         }
2993
2994                         /*
2995                          * Enqueue the request.  If the socket was previously
2996                          * not being watched, poke the watcher to start
2997                          * paying attention to it.
2998                          */
2999                         if (ISC_LIST_EMPTY(sock->send_list))
3000                                 select_poke(sock->manager, sock->fd,
3001                                             SELECT_POKE_WRITE);
3002                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3003
3004                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
3005                                    "socket_send: event %p -> task %p",
3006                                    dev, ntask);
3007
3008                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3009                                 result = ISC_R_INPROGRESS;
3010                         break;
3011                 }
3012
3013         case DOIO_HARD:
3014         case DOIO_SUCCESS:
3015                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
3016                         send_senddone_event(sock, &dev);
3017                 break;
3018         }
3019
3020         if (have_lock)
3021                 UNLOCK(&sock->lock);
3022
3023         return (result);
3024 }
3025
3026 isc_result_t
3027 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
3028                 isc_task_t *task, isc_taskaction_t action, const void *arg)
3029 {
3030         /*
3031          * REQUIRE() checking is performed in isc_socket_sendto().
3032          */
3033         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3034                                   NULL));
3035 }
3036
3037 isc_result_t
3038 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
3039                   isc_task_t *task, isc_taskaction_t action, const void *arg,
3040                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3041 {
3042         isc_socketevent_t *dev;
3043         isc_socketmgr_t *manager;
3044
3045         REQUIRE(VALID_SOCKET(sock));
3046         REQUIRE(region != NULL);
3047         REQUIRE(task != NULL);
3048         REQUIRE(action != NULL);
3049
3050         manager = sock->manager;
3051         REQUIRE(VALID_MANAGER(manager));
3052
3053         INSIST(sock->bound);
3054
3055         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3056         if (dev == NULL) {
3057                 return (ISC_R_NOMEMORY);
3058         }
3059
3060         dev->region = *region;
3061
3062         return (socket_send(sock, dev, task, address, pktinfo, 0));
3063 }
3064
3065 isc_result_t
3066 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3067                  isc_task_t *task, isc_taskaction_t action, const void *arg)
3068 {
3069         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3070                                    NULL));
3071 }
3072
3073 isc_result_t
3074 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3075                    isc_task_t *task, isc_taskaction_t action, const void *arg,
3076                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3077 {
3078         isc_socketevent_t *dev;
3079         isc_socketmgr_t *manager;
3080         unsigned int iocount;
3081         isc_buffer_t *buffer;
3082
3083         REQUIRE(VALID_SOCKET(sock));
3084         REQUIRE(buflist != NULL);
3085         REQUIRE(!ISC_LIST_EMPTY(*buflist));
3086         REQUIRE(task != NULL);
3087         REQUIRE(action != NULL);
3088
3089         manager = sock->manager;
3090         REQUIRE(VALID_MANAGER(manager));
3091
3092         iocount = isc_bufferlist_usedcount(buflist);
3093         REQUIRE(iocount > 0);
3094
3095         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3096         if (dev == NULL) {
3097                 return (ISC_R_NOMEMORY);
3098         }
3099
3100         /*
3101          * Move each buffer from the passed in list to our internal one.
3102          */
3103         buffer = ISC_LIST_HEAD(*buflist);
3104         while (buffer != NULL) {
3105                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
3106                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3107                 buffer = ISC_LIST_HEAD(*buflist);
3108         }
3109
3110         return (socket_send(sock, dev, task, address, pktinfo, 0));
3111 }
3112
3113 isc_result_t
3114 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3115                    isc_task_t *task,
3116                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3117                    isc_socketevent_t *event, unsigned int flags)
3118 {
3119         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3120         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3121                 REQUIRE(sock->type == isc_sockettype_udp);
3122         event->ev_sender = sock;
3123         event->result = ISC_R_UNEXPECTED;
3124         ISC_LIST_INIT(event->bufferlist);
3125         event->region = *region;
3126         event->n = 0;
3127         event->offset = 0;
3128         event->attributes = 0;
3129
3130         return (socket_send(sock, event, task, address, pktinfo, flags));
3131 }
3132
3133 void
3134 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
3135 #ifdef ISC_PLATFORM_HAVESYSUNH
3136         int s;
3137         struct stat sb;
3138         char strbuf[ISC_STRERRORSIZE];
3139
3140         if (sockaddr->type.sa.sa_family != AF_UNIX)
3141                 return;
3142
3143 #ifndef S_ISSOCK
3144 #if defined(S_IFMT) && defined(S_IFSOCK)
3145 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
3146 #elif defined(_S_IFMT) && defined(S_IFSOCK)
3147 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
3148 #endif
3149 #endif
3150
3151 #ifndef S_ISFIFO
3152 #if defined(S_IFMT) && defined(S_IFIFO)
3153 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
3154 #elif defined(_S_IFMT) && defined(S_IFIFO)
3155 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
3156 #endif
3157 #endif
3158
3159 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
3160 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
3161 #endif
3162
3163 #ifndef S_ISFIFO
3164 #define S_ISFIFO(mode) 0
3165 #endif
3166
3167 #ifndef S_ISSOCK
3168 #define S_ISSOCK(mode) 0
3169 #endif
3170
3171         if (active) {
3172                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3173                         isc__strerror(errno, strbuf, sizeof(strbuf));
3174                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3175                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3176                                       "isc_socket_cleanunix: stat(%s): %s",
3177                                       sockaddr->type.sunix.sun_path, strbuf);
3178                         return;
3179                 }
3180                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3181                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3182                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3183                                       "isc_socket_cleanunix: %s: not a socket",
3184                                       sockaddr->type.sunix.sun_path);
3185                         return;
3186                 }
3187                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3188                         isc__strerror(errno, strbuf, sizeof(strbuf));
3189                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3190                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3191                                       "isc_socket_cleanunix: unlink(%s): %s",
3192                                       sockaddr->type.sunix.sun_path, strbuf);
3193                 }
3194                 return;
3195         }
3196
3197         s = socket(AF_UNIX, SOCK_STREAM, 0);
3198         if (s < 0) {
3199                 isc__strerror(errno, strbuf, sizeof(strbuf));
3200                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3201                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3202                               "isc_socket_cleanunix: socket(%s): %s",
3203                               sockaddr->type.sunix.sun_path, strbuf);
3204                 return;
3205         }
3206
3207         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
3208                 switch (errno) {
3209                 case ENOENT:    /* We exited cleanly last time */
3210                         break;
3211                 default:
3212                         isc__strerror(errno, strbuf, sizeof(strbuf));
3213                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3214                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3215                                       "isc_socket_cleanunix: stat(%s): %s",
3216                                       sockaddr->type.sunix.sun_path, strbuf);
3217                         break;
3218                 }
3219                 goto cleanup;
3220         }
3221
3222         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
3223                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3224                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3225                               "isc_socket_cleanunix: %s: not a socket",
3226                               sockaddr->type.sunix.sun_path);
3227                 goto cleanup;
3228         }
3229
3230         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
3231                     sizeof(sockaddr->type.sunix)) < 0) {
3232                 switch (errno) {
3233                 case ECONNREFUSED:
3234                 case ECONNRESET:
3235                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
3236                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3237                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3238                                               ISC_LOGMODULE_SOCKET,
3239                                               ISC_LOG_WARNING,
3240                                               "isc_socket_cleanunix: "
3241                                               "unlink(%s): %s",
3242                                               sockaddr->type.sunix.sun_path,
3243                                               strbuf);
3244                         }
3245                         break;
3246                 default:
3247                         isc__strerror(errno, strbuf, sizeof(strbuf));
3248                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3249                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
3250                                       "isc_socket_cleanunix: connect(%s): %s",
3251                                       sockaddr->type.sunix.sun_path, strbuf);
3252                         break;
3253                 }
3254         }
3255  cleanup:
3256         close(s);
3257 #else
3258         UNUSED(sockaddr);
3259         UNUSED(active);
3260 #endif
3261 }
3262
3263 isc_result_t
3264 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
3265                     isc_uint32_t owner, isc_uint32_t group)
3266 {
3267 #ifdef ISC_PLATFORM_HAVESYSUNH
3268         isc_result_t result = ISC_R_SUCCESS;
3269         char strbuf[ISC_STRERRORSIZE];
3270         char path[sizeof(sockaddr->type.sunix.sun_path)];
3271 #ifdef NEED_SECURE_DIRECTORY
3272         char *slash;
3273 #endif
3274
3275         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
3276         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
3277         strcpy(path, sockaddr->type.sunix.sun_path);
3278
3279 #ifdef NEED_SECURE_DIRECTORY
3280         slash = strrchr(path, '/');
3281         if (slash != NULL) {
3282                 if (slash != path)
3283                         *slash = '\0';
3284                 else
3285                         strcpy(path, "/");
3286         } else
3287                 strcpy(path, ".");
3288 #endif
3289         
3290         if (chmod(path, perm) < 0) {
3291                 isc__strerror(errno, strbuf, sizeof(strbuf));
3292                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3293                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3294                               "isc_socket_permunix: chmod(%s, %d): %s",
3295                               path, perm, strbuf);
3296                 result = ISC_R_FAILURE;
3297         }
3298         if (chown(path, owner, group) < 0) {
3299                 isc__strerror(errno, strbuf, sizeof(strbuf));
3300                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3301                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3302                               "isc_socket_permunix: chown(%s, %d, %d): %s",
3303                               path, owner, group,
3304                               strbuf);
3305                 result = ISC_R_FAILURE;
3306         }
3307         return (result);
3308 #else
3309         UNUSED(sockaddr);
3310         UNUSED(perm);
3311         UNUSED(owner);
3312         UNUSED(group);
3313         return (ISC_R_NOTIMPLEMENTED);
3314 #endif
3315 }
3316
3317 isc_result_t
3318 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr, 
3319                 unsigned int options) {
3320         char strbuf[ISC_STRERRORSIZE];
3321         int on = 1;
3322
3323         LOCK(&sock->lock);
3324
3325         INSIST(!sock->bound);
3326
3327         if (sock->pf != sockaddr->type.sa.sa_family) {
3328                 UNLOCK(&sock->lock);
3329                 return (ISC_R_FAMILYMISMATCH);
3330         }
3331         /*
3332          * Only set SO_REUSEADDR when we want a specific port.
3333          */
3334 #ifdef AF_UNIX
3335         if (sock->pf == AF_UNIX)
3336                 goto bind_socket;
3337 #endif
3338         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3339             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3340             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
3341                        sizeof(on)) < 0) {
3342                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3343                                  "setsockopt(%d) %s", sock->fd,
3344                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3345                                                 ISC_MSG_FAILED, "failed"));
3346                 /* Press on... */
3347         }
3348 #ifdef AF_UNIX
3349  bind_socket:
3350 #endif
3351         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3352                 UNLOCK(&sock->lock);
3353                 switch (errno) {
3354                 case EACCES:
3355                         return (ISC_R_NOPERM);
3356                 case EADDRNOTAVAIL:
3357                         return (ISC_R_ADDRNOTAVAIL);
3358                 case EADDRINUSE:
3359                         return (ISC_R_ADDRINUSE);
3360                 case EINVAL:
3361                         return (ISC_R_BOUND);
3362                 default:
3363                         isc__strerror(errno, strbuf, sizeof(strbuf));
3364                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3365                                          strbuf);
3366                         return (ISC_R_UNEXPECTED);
3367                 }
3368         }
3369
3370         socket_log(sock, sockaddr, TRACE,
3371                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3372         sock->bound = 1;
3373
3374         UNLOCK(&sock->lock);
3375         return (ISC_R_SUCCESS);
3376 }
3377
3378 isc_result_t
3379 isc_socket_filter(isc_socket_t *sock, const char *filter) {
3380 #ifdef SO_ACCEPTFILTER
3381         char strbuf[ISC_STRERRORSIZE];
3382         struct accept_filter_arg afa;
3383 #else
3384         UNUSED(sock);
3385         UNUSED(filter);
3386 #endif
3387
3388         REQUIRE(VALID_SOCKET(sock));
3389
3390 #ifdef SO_ACCEPTFILTER
3391         bzero(&afa, sizeof(afa));
3392         strncpy(afa.af_name, filter, sizeof(afa.af_name));
3393         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
3394                          &afa, sizeof(afa)) == -1) {
3395                 isc__strerror(errno, strbuf, sizeof(strbuf));
3396                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
3397                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
3398                            strbuf);
3399                 return (ISC_R_FAILURE);
3400         }
3401         return (ISC_R_SUCCESS);
3402 #else
3403         return (ISC_R_NOTIMPLEMENTED);
3404 #endif
3405 }
3406
3407 /*
3408  * Set up to listen on a given socket.  We do this by creating an internal
3409  * event that will be dispatched when the socket has read activity.  The
3410  * watcher will send the internal event to the task when there is a new
3411  * connection.
3412  *
3413  * Unlike in read, we don't preallocate a done event here.  Every time there
3414  * is a new connection we'll have to allocate a new one anyway, so we might
3415  * as well keep things simple rather than having to track them.
3416  */
3417 isc_result_t
3418 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
3419         char strbuf[ISC_STRERRORSIZE];
3420
3421         REQUIRE(VALID_SOCKET(sock));
3422
3423         LOCK(&sock->lock);
3424
3425         REQUIRE(!sock->listener);
3426         REQUIRE(sock->bound);
3427         REQUIRE(sock->type == isc_sockettype_tcp ||
3428                 sock->type == isc_sockettype_unix);
3429
3430         if (backlog == 0)
3431                 backlog = SOMAXCONN;
3432
3433         if (listen(sock->fd, (int)backlog) < 0) {
3434                 UNLOCK(&sock->lock);
3435                 isc__strerror(errno, strbuf, sizeof(strbuf));
3436
3437                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3438
3439                 return (ISC_R_UNEXPECTED);
3440         }
3441
3442         sock->listener = 1;
3443
3444         UNLOCK(&sock->lock);
3445         return (ISC_R_SUCCESS);
3446 }
3447
3448 /*
3449  * This should try to do agressive accept() XXXMLG
3450  */
3451 isc_result_t
3452 isc_socket_accept(isc_socket_t *sock,
3453                   isc_task_t *task, isc_taskaction_t action, const void *arg)
3454 {
3455         isc_socket_newconnev_t *dev;
3456         isc_socketmgr_t *manager;
3457         isc_task_t *ntask = NULL;
3458         isc_socket_t *nsock;
3459         isc_result_t result;
3460         isc_boolean_t do_poke = ISC_FALSE;
3461
3462         REQUIRE(VALID_SOCKET(sock));
3463         manager = sock->manager;
3464         REQUIRE(VALID_MANAGER(manager));
3465
3466         LOCK(&sock->lock);
3467
3468         REQUIRE(sock->listener);
3469
3470         /*
3471          * Sender field is overloaded here with the task we will be sending
3472          * this event to.  Just before the actual event is delivered the
3473          * actual ev_sender will be touched up to be the socket.
3474          */
3475         dev = (isc_socket_newconnev_t *)
3476                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3477                                    action, arg, sizeof(*dev));
3478         if (dev == NULL) {
3479                 UNLOCK(&sock->lock);
3480                 return (ISC_R_NOMEMORY);
3481         }
3482         ISC_LINK_INIT(dev, ev_link);
3483
3484         result = allocate_socket(manager, sock->type, &nsock);
3485         if (result != ISC_R_SUCCESS) {
3486                 isc_event_free(ISC_EVENT_PTR(&dev));
3487                 UNLOCK(&sock->lock);
3488                 return (result);
3489         }
3490
3491         /*
3492          * Attach to socket and to task.
3493          */
3494         isc_task_attach(task, &ntask);
3495         nsock->references++;
3496
3497         dev->ev_sender = ntask;
3498         dev->newsocket = nsock;
3499
3500         /*
3501          * Poke watcher here.  We still have the socket locked, so there
3502          * is no race condition.  We will keep the lock for such a short
3503          * bit of time waking it up now or later won't matter all that much.
3504          */
3505         if (ISC_LIST_EMPTY(sock->accept_list))
3506                 do_poke = ISC_TRUE;
3507
3508         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3509
3510         if (do_poke)
3511                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3512
3513         UNLOCK(&sock->lock);
3514         return (ISC_R_SUCCESS);
3515 }
3516
3517 isc_result_t
3518 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3519                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3520 {
3521         isc_socket_connev_t *dev;
3522         isc_task_t *ntask = NULL;
3523         isc_socketmgr_t *manager;
3524         int cc;
3525         char strbuf[ISC_STRERRORSIZE];
3526
3527         REQUIRE(VALID_SOCKET(sock));
3528         REQUIRE(addr != NULL);
3529         REQUIRE(task != NULL);
3530         REQUIRE(action != NULL);
3531
3532         manager = sock->manager;
3533         REQUIRE(VALID_MANAGER(manager));
3534         REQUIRE(addr != NULL);
3535
3536         if (isc_sockaddr_ismulticast(addr))
3537                 return (ISC_R_MULTICAST);
3538
3539         LOCK(&sock->lock);
3540
3541         REQUIRE(!sock->connecting);
3542
3543         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3544                                                         ISC_SOCKEVENT_CONNECT,
3545                                                         action, arg,
3546                                                         sizeof(*dev));
3547         if (dev == NULL) {
3548                 UNLOCK(&sock->lock);
3549                 return (ISC_R_NOMEMORY);
3550         }
3551         ISC_LINK_INIT(dev, ev_link);
3552
3553         /*
3554          * Try to do the connect right away, as there can be only one
3555          * outstanding, and it might happen to complete.
3556          */
3557         sock->address = *addr;
3558         cc = connect(sock->fd, &addr->type.sa, addr->length);
3559         if (cc < 0) {
3560                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3561                         goto queue;
3562
3563                 switch (errno) {
3564 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3565                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3566                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3567                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3568                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3569                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3570 #ifdef EHOSTDOWN
3571                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3572 #endif
3573                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3574                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3575                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3576                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3577                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3578 #undef ERROR_MATCH
3579                 }
3580
3581                 sock->connected = 0;
3582
3583                 isc__strerror(errno, strbuf, sizeof(strbuf));
3584                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3585
3586                 UNLOCK(&sock->lock);
3587                 isc_event_free(ISC_EVENT_PTR(&dev));
3588                 return (ISC_R_UNEXPECTED);
3589
3590         err_exit:
3591                 sock->connected = 0;
3592                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3593
3594                 UNLOCK(&sock->lock);
3595                 return (ISC_R_SUCCESS);
3596         }
3597
3598         /*
3599          * If connect completed, fire off the done event.
3600          */
3601         if (cc == 0) {
3602                 sock->connected = 1;
3603                 sock->bound = 1;
3604                 dev->result = ISC_R_SUCCESS;
3605                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3606
3607                 UNLOCK(&sock->lock);
3608                 return (ISC_R_SUCCESS);
3609         }
3610
3611  queue:
3612
3613         /*
3614          * Attach to task.
3615          */
3616         isc_task_attach(task, &ntask);
3617
3618         sock->connecting = 1;
3619
3620         dev->ev_sender = ntask;
3621
3622         /*
3623          * Poke watcher here.  We still have the socket locked, so there
3624          * is no race condition.  We will keep the lock for such a short
3625          * bit of time waking it up now or later won't matter all that much.
3626          */
3627         if (sock->connect_ev == NULL)
3628                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3629
3630         sock->connect_ev = dev;
3631
3632         UNLOCK(&sock->lock);
3633         return (ISC_R_SUCCESS);
3634 }
3635
3636 /*
3637  * Called when a socket with a pending connect() finishes.
3638  */
3639 static void
3640 internal_connect(isc_task_t *me, isc_event_t *ev) {
3641         isc_socket_t *sock;
3642         isc_socket_connev_t *dev;
3643         isc_task_t *task;
3644         int cc;
3645         ISC_SOCKADDR_LEN_T optlen;
3646         char strbuf[ISC_STRERRORSIZE];
3647         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3648
3649         UNUSED(me);
3650         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3651
3652         sock = ev->ev_sender;
3653         INSIST(VALID_SOCKET(sock));
3654
3655         LOCK(&sock->lock);
3656
3657         /*
3658          * When the internal event was sent the reference count was bumped
3659          * to keep the socket around for us.  Decrement the count here.
3660          */
3661         INSIST(sock->references > 0);
3662         sock->references--;
3663         if (sock->references == 0) {
3664                 UNLOCK(&sock->lock);
3665                 destroy(&sock);
3666                 return;
3667         }
3668
3669         /*
3670          * Has this event been canceled?
3671          */
3672         dev = sock->connect_ev;
3673         if (dev == NULL) {
3674                 INSIST(!sock->connecting);
3675                 UNLOCK(&sock->lock);
3676                 return;
3677         }
3678
3679         INSIST(sock->connecting);
3680         sock->connecting = 0;
3681
3682         /*
3683          * Get any possible error status here.
3684          */
3685         optlen = sizeof(cc);
3686         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3687                        (void *)&cc, (void *)&optlen) < 0)
3688                 cc = errno;
3689         else
3690                 errno = cc;
3691
3692         if (errno != 0) {
3693                 /*
3694                  * If the error is EAGAIN, just re-select on this
3695                  * fd and pretend nothing strange happened.
3696                  */
3697                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3698                         sock->connecting = 1;
3699                         select_poke(sock->manager, sock->fd,
3700                                     SELECT_POKE_CONNECT);
3701                         UNLOCK(&sock->lock);
3702
3703                         return;
3704                 }
3705
3706                 /*
3707                  * Translate other errors into ISC_R_* flavors.
3708                  */
3709                 switch (errno) {
3710 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3711                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3712                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3713                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3714                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3715                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3716 #ifdef EHOSTDOWN
3717                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3718 #endif
3719                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3720                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3721                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3722                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3723                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3724                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3725 #undef ERROR_MATCH
3726                 default:
3727                         dev->result = ISC_R_UNEXPECTED;
3728                         isc_sockaddr_format(&sock->address, peerbuf,
3729                                             sizeof(peerbuf));
3730                         isc__strerror(errno, strbuf, sizeof(strbuf));
3731                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3732                                          "internal_connect: connect(%s) %s",
3733                                          peerbuf, strbuf);
3734                 }
3735         } else {
3736                 dev->result = ISC_R_SUCCESS;
3737                 sock->connected = 1;
3738                 sock->bound = 1;
3739         }
3740
3741         sock->connect_ev = NULL;
3742
3743         UNLOCK(&sock->lock);
3744
3745         task = dev->ev_sender;
3746         dev->ev_sender = sock;
3747         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3748 }
3749
3750 isc_result_t
3751 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3752         isc_result_t result;
3753
3754         REQUIRE(VALID_SOCKET(sock));
3755         REQUIRE(addressp != NULL);
3756
3757         LOCK(&sock->lock);
3758
3759         if (sock->connected) {
3760                 *addressp = sock->address;
3761                 result = ISC_R_SUCCESS;
3762         } else {
3763                 result = ISC_R_NOTCONNECTED;
3764         }
3765
3766         UNLOCK(&sock->lock);
3767
3768         return (result);
3769 }
3770
3771 isc_result_t
3772 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3773         ISC_SOCKADDR_LEN_T len;
3774         isc_result_t result;
3775         char strbuf[ISC_STRERRORSIZE];
3776
3777         REQUIRE(VALID_SOCKET(sock));
3778         REQUIRE(addressp != NULL);
3779
3780         LOCK(&sock->lock);
3781
3782         if (!sock->bound) {
3783                 result = ISC_R_NOTBOUND;
3784                 goto out;
3785         }
3786
3787         result = ISC_R_SUCCESS;
3788
3789         len = sizeof(addressp->type);
3790         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3791                 isc__strerror(errno, strbuf, sizeof(strbuf));
3792                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3793                                  strbuf);
3794                 result = ISC_R_UNEXPECTED;
3795                 goto out;
3796         }
3797         addressp->length = (unsigned int)len;
3798
3799  out:
3800         UNLOCK(&sock->lock);
3801
3802         return (result);
3803 }
3804
3805 /*
3806  * Run through the list of events on this socket, and cancel the ones
3807  * queued for task "task" of type "how".  "how" is a bitmask.
3808  */
3809 void
3810 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3811
3812         REQUIRE(VALID_SOCKET(sock));
3813
3814         /*
3815          * Quick exit if there is nothing to do.  Don't even bother locking
3816          * in this case.
3817          */
3818         if (how == 0)
3819                 return;
3820
3821         LOCK(&sock->lock);
3822
3823         /*
3824          * All of these do the same thing, more or less.
3825          * Each will:
3826          *      o If the internal event is marked as "posted" try to
3827          *        remove it from the task's queue.  If this fails, mark it
3828          *        as canceled instead, and let the task clean it up later.
3829          *      o For each I/O request for that task of that type, post
3830          *        its done event with status of "ISC_R_CANCELED".
3831          *      o Reset any state needed.
3832          */
3833         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3834             && !ISC_LIST_EMPTY(sock->recv_list)) {
3835                 isc_socketevent_t      *dev;
3836                 isc_socketevent_t      *next;
3837                 isc_task_t             *current_task;
3838
3839                 dev = ISC_LIST_HEAD(sock->recv_list);
3840
3841                 while (dev != NULL) {
3842                         current_task = dev->ev_sender;
3843                         next = ISC_LIST_NEXT(dev, ev_link);
3844
3845                         if ((task == NULL) || (task == current_task)) {
3846                                 dev->result = ISC_R_CANCELED;
3847                                 send_recvdone_event(sock, &dev);
3848                         }
3849                         dev = next;
3850                 }
3851         }
3852
3853         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3854             && !ISC_LIST_EMPTY(sock->send_list)) {
3855                 isc_socketevent_t      *dev;
3856                 isc_socketevent_t      *next;
3857                 isc_task_t             *current_task;
3858
3859                 dev = ISC_LIST_HEAD(sock->send_list);
3860
3861                 while (dev != NULL) {
3862                         current_task = dev->ev_sender;
3863                         next = ISC_LIST_NEXT(dev, ev_link);
3864
3865                         if ((task == NULL) || (task == current_task)) {
3866                                 dev->result = ISC_R_CANCELED;
3867                                 send_senddone_event(sock, &dev);
3868                         }
3869                         dev = next;
3870                 }
3871         }
3872
3873         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3874             && !ISC_LIST_EMPTY(sock->accept_list)) {
3875                 isc_socket_newconnev_t *dev;
3876                 isc_socket_newconnev_t *next;
3877                 isc_task_t             *current_task;
3878
3879                 dev = ISC_LIST_HEAD(sock->accept_list);
3880                 while (dev != NULL) {
3881                         current_task = dev->ev_sender;
3882                         next = ISC_LIST_NEXT(dev, ev_link);
3883
3884                         if ((task == NULL) || (task == current_task)) {
3885
3886                                 ISC_LIST_UNLINK(sock->accept_list, dev,
3887                                                 ev_link);
3888
3889                                 dev->newsocket->references--;
3890                                 free_socket(&dev->newsocket);
3891
3892                                 dev->result = ISC_R_CANCELED;
3893                                 dev->ev_sender = sock;
3894                                 isc_task_sendanddetach(&current_task,
3895                                                        ISC_EVENT_PTR(&dev));
3896                         }
3897
3898                         dev = next;
3899                 }
3900         }
3901
3902         /*
3903          * Connecting is not a list.
3904          */
3905         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3906             && sock->connect_ev != NULL) {
3907                 isc_socket_connev_t    *dev;
3908                 isc_task_t             *current_task;
3909
3910                 INSIST(sock->connecting);
3911                 sock->connecting = 0;
3912
3913                 dev = sock->connect_ev;
3914                 current_task = dev->ev_sender;
3915
3916                 if ((task == NULL) || (task == current_task)) {
3917                         sock->connect_ev = NULL;
3918
3919                         dev->result = ISC_R_CANCELED;
3920                         dev->ev_sender = sock;
3921                         isc_task_sendanddetach(&current_task,
3922                                                ISC_EVENT_PTR(&dev));
3923                 }
3924         }
3925
3926         UNLOCK(&sock->lock);
3927 }
3928
3929 isc_sockettype_t
3930 isc_socket_gettype(isc_socket_t *sock) {
3931         REQUIRE(VALID_SOCKET(sock));
3932
3933         return (sock->type);
3934 }
3935
3936 isc_boolean_t
3937 isc_socket_isbound(isc_socket_t *sock) {
3938         isc_boolean_t val;
3939
3940         LOCK(&sock->lock);
3941         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3942         UNLOCK(&sock->lock);
3943
3944         return (val);
3945 }
3946
3947 void
3948 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3949 #if defined(IPV6_V6ONLY)
3950         int onoff = yes ? 1 : 0;
3951 #else
3952         UNUSED(yes);
3953         UNUSED(sock);
3954 #endif
3955
3956         REQUIRE(VALID_SOCKET(sock));
3957
3958 #ifdef IPV6_V6ONLY
3959         if (sock->pf == AF_INET6) {
3960                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3961                                  (void *)&onoff, sizeof(onoff));
3962         }
3963 #endif
3964 }
3965
3966 #ifndef ISC_PLATFORM_USETHREADS
3967 void
3968 isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd) {
3969         if (socketmgr == NULL)
3970                 *maxfd = 0;
3971         else {
3972                 /* Prepare duplicates of fd_sets, as select() will modify */
3973                 memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
3974                        socketmgr->fd_bufsize);
3975                 memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
3976                        socketmgr->fd_bufsize);
3977                 *readset = socketmgr->read_fds_copy;
3978                 *writeset = socketmgr->write_fds_copy;
3979                 *maxfd = socketmgr->maxfd + 1;
3980         }
3981 }
3982
3983 isc_result_t
3984 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3985         isc_socketmgr_t *manager = socketmgr;
3986
3987         if (manager == NULL)
3988                 return (ISC_R_NOTFOUND);
3989
3990         process_fds(manager, maxfd, readset, writeset);
3991         return (ISC_R_SUCCESS);
3992 }
3993 #endif /* ISC_PLATFORM_USETHREADS */