]> CyberLeo.Net >> Repos - FreeBSD/releng/10.2.git/blob - contrib/ntp/lib/isc/win32/socket.c
- Copy stable/10@285827 to releng/10.2 in preparation for 10.2-RC1
[FreeBSD/releng/10.2.git] / contrib / ntp / lib / isc / win32 / socket.c
1 /*
2  * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 2000-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id$ */
19
20 /* This code uses functions which are only available on Server 2003 and
21  * higher, and Windows XP and higher.
22  *
23  * This code is by nature multithreaded and takes advantage of various
24  * features to pass on information through the completion port for
25  * when I/O is completed.  All sends, receives, accepts, and connects are
26  * completed through the completion port.
27  *
28  * The number of Completion Port Worker threads used is the total number
29  * of CPUs + 1 (capped at MAX_IOCPTHREADS). This increases the likelihood
30  * that a Worker Thread is available for processing a completed request.
31  *
32  * XXXPDM 5 August, 2002
33  */
34
35 #define MAKE_EXTERNAL 1
36 #include <config.h>
37
38 #include <sys/types.h>
39
40 #ifndef _WINSOCKAPI_
41 #define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42 #endif
43
44 #include <errno.h>
45 #include <stddef.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49 #include <io.h>
50 #include <fcntl.h>
51 #include <process.h>
52
53 #include <isc/buffer.h>
54 #include <isc/bufferlist.h>
55 #include <isc/condition.h>
56 #include <isc/list.h>
57 #include <isc/log.h>
58 #include <isc/mem.h>
59 #include <isc/msgs.h>
60 #include <isc/mutex.h>
61 #include <isc/net.h>
62 #include <isc/once.h>
63 #include <isc/os.h>
64 #include <isc/platform.h>
65 #include <isc/print.h>
66 #include <isc/region.h>
67 #include <isc/socket.h>
68 #include <isc/stats.h>
69 #include <isc/strerror.h>
70 #include <isc/syslog.h>
71 #include <isc/task.h>
72 #include <isc/thread.h>
73 #include <isc/util.h>
74 #include <isc/win32os.h>
75
76 #include <mswsock.h>
77
78 #include "errno2result.h"
79
80 /*
81  * How in the world can Microsoft exist with APIs like this?
82  * We can't actually call this directly, because it turns out
83  * no library exports this function.  Instead, we need to
84  * issue a runtime call to get the address.
85  */
86 LPFN_CONNECTEX ISCConnectEx;
87 LPFN_ACCEPTEX ISCAcceptEx;
88 LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
89
90 /*
91  * Run expensive internal consistency checks.
92  */
93 #ifdef ISC_SOCKET_CONSISTENCY_CHECKS
94 #define CONSISTENT(sock) consistent(sock)
95 #else
96 #define CONSISTENT(sock) do {} while (0)
97 #endif
98 static void consistent(isc_socket_t *sock);
99
100 /*
101  * Define this macro to control the behavior of connection
102  * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
103  * for details.
104  * NOTE: This requires that Windows 2000 systems install Service Pack 2
105  * or later.
106  */
107 #ifndef SIO_UDP_CONNRESET
108 #define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
109 #endif
110
111 /*
112  * Some systems define the socket length argument as an int, some as size_t,
113  * some as socklen_t.  This is here so it can be easily changed if needed.
114  */
115 #ifndef ISC_SOCKADDR_LEN_T
116 #define ISC_SOCKADDR_LEN_T unsigned int
117 #endif
118
119 /*
120  * Define what the possible "soft" errors can be.  These are non-fatal returns
121  * of various network related functions, like recv() and so on.
122  */
123 #define SOFT_ERROR(e)   ((e) == WSAEINTR || \
124                          (e) == WSAEWOULDBLOCK || \
125                          (e) == EWOULDBLOCK || \
126                          (e) == EINTR || \
127                          (e) == EAGAIN || \
128                          (e) == 0)
129
130 /*
131  * Pending errors are not really errors and should be
132  * kept separate
133  */
134 #define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)
135
136 #define DOIO_SUCCESS      0       /* i/o ok, event sent */
137 #define DOIO_SOFT         1       /* i/o ok, soft error, no event sent */
138 #define DOIO_HARD         2       /* i/o error, event sent */
139 #define DOIO_EOF          3       /* EOF, no event sent */
140 #define DOIO_PENDING      4       /* status when i/o is in process */
141 #define DOIO_NEEDMORE     5       /* IO was processed, but we need more due to minimum */
142
143 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
144
145 /*
146  * DLVL(90)  --  Function entry/exit and other tracing.
147  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
148  * DLVL(60)  --  Socket data send/receive
149  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
150  * DLVL(20)  --  Socket creation/destruction.
151  */
152 #define TRACE_LEVEL             90
153 #define CORRECTNESS_LEVEL       70
154 #define IOEVENT_LEVEL           60
155 #define EVENT_LEVEL             50
156 #define CREATION_LEVEL          20
157
158 #define TRACE           DLVL(TRACE_LEVEL)
159 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
160 #define IOEVENT         DLVL(IOEVENT_LEVEL)
161 #define EVENT           DLVL(EVENT_LEVEL)
162 #define CREATION        DLVL(CREATION_LEVEL)
163
164 typedef isc_event_t intev_t;
165
166 /*
167  * Socket State
168  */
169 enum {
170   SOCK_INITIALIZED,     /* Socket Initialized */
171   SOCK_OPEN,            /* Socket opened but nothing yet to do */
172   SOCK_DATA,            /* Socket sending or receiving data */
173   SOCK_LISTEN,          /* TCP Socket listening for connects */
174   SOCK_ACCEPT,          /* TCP socket is waiting to accept */
175   SOCK_CONNECT,         /* TCP Socket connecting */
176   SOCK_CLOSED,          /* Socket has been closed */
177 };
178
179 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
180 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
181
182 /*
183  * IPv6 control information.  If the socket is an IPv6 socket we want
184  * to collect the destination address and interface so the client can
185  * set them on outgoing packets.
186  */
187 #ifdef ISC_PLATFORM_HAVEIPV6
188 #ifndef USE_CMSG
189 #define USE_CMSG        1
190 #endif
191 #endif
192
193 /*
194  * We really  don't want to try and use these control messages. Win32
195  * doesn't have this mechanism before XP.
196  */
197 #undef USE_CMSG
198
199 /*
200  * Message header for recvmsg and sendmsg calls.
201  * Used value-result for recvmsg, value only for sendmsg.
202  */
/*
 * Describes one send/receive request: the peer address plus the
 * scatter/gather buffers.  internal_sendmsg() hands to_addr, to_addr_len,
 * msg_iov and msg_iovlen straight to WSASendTo().
 */
203 struct msghdr {
204         SOCKADDR_STORAGE to_addr;       /* UDP send/recv address (destination for WSASendTo) */
205         int      to_addr_len;           /* length of the address stored in to_addr */
206         WSABUF  *msg_iov;               /* scatter/gather array */
207         u_int   msg_iovlen;             /* # elements in msg_iov */
208         void    *msg_control;           /* ancillary data; presumably unused while USE_CMSG is undefined -- TODO confirm */
209         u_int   msg_controllen;         /* ancillary data buffer len */
210         int     msg_totallen;           /* total length of this message */
211 } msghdr;       /* NOTE(review): this also defines a file-scope object named 'msghdr'; confirm it is intentional */
212
213 /*
214  * The size to raise the receive buffer to.
215  */
216 #define RCVBUFSIZE (32*1024)
217
218 /*
219  * The number of times a send operation is repeated if the result
220  * is WSAEINTR.
221  */
222 #define NRETRIES 10
223
/*
 * Per-socket state.  A pointer to this structure is used as the completion
 * key when the socket is associated with the manager's I/O completion port
 * (see iocompletionport_update()).
 */
224 struct isc_socket {
225         /* Not locked. */
226         unsigned int            magic;          /* SOCKET_MAGIC while the structure is valid */
227         isc_socketmgr_t        *manager;        /* owning socket manager */
228         isc_mutex_t             lock;
229         isc_sockettype_t        type;
230
231         /* Pointers to scatter/gather buffers */
232         WSABUF                  iov[ISC_SOCKET_MAXSCATTERGATHER];
233
234         /* Locked by socket lock. */
235         ISC_LINK(isc_socket_t)  link;
236         unsigned int            references; /* EXTERNAL references */
237         SOCKET                  fd;     /* Winsock handle; INVALID_SOCKET once socket_close() has run */
238         int                     pf;     /* protocol family */
239         char                    name[16];
240         void *                  tag;
241
242         /*
243          * Each recv() call uses this buffer.  It is a per-socket receive
244          * buffer that allows us to decouple the system recv() from the
245          * recv_list done events.  This means the items on the recv_list
246          * can be removed without having to cancel pending system recv()
247          * calls.  It also allows us to read-ahead in some cases.
248          */
249         struct {
250                 SOCKADDR_STORAGE        from_addr;         /* sender address, filled in by WSARecvFrom() */
251                 int             from_addr_len;     /* length of from_addr; reset to sizeof(from_addr) before each receive */
252                 char            *base;             /* the base of the buffer */
253                 char            *consume_position; /* where to start copying data from next */
254                 unsigned int    len;               /* the actual size of this buffer */
255                 unsigned int    remaining;         /* bytes not yet consumed; must be 0 before a new receive is queued */
256         } recvbuf;
257
258         ISC_LIST(isc_socketevent_t)             send_list;
259         ISC_LIST(isc_socketevent_t)             recv_list;
260         ISC_LIST(isc_socket_newconnev_t)        accept_list;
261         isc_socket_connev_t                    *connect_ev;
262
263         isc_sockaddr_t          address;  /* remote address */
264
265         unsigned int            listener : 1,   /* listener socket */
266                                 connected : 1,
267                                 pending_connect : 1, /* connect pending */
268                                 bound : 1,      /* bound to local addr */
269                                 dupped : 1;     /* created by isc_socket_dup() */
270         unsigned int            pending_iocp;   /* Should equal the counters below. Debug. */
271         unsigned int            pending_recv;  /* Number of outstanding recv() calls. */
272         unsigned int            pending_send;  /* Number of outstanding send() calls. */
273         unsigned int            pending_accept; /* Number of outstanding accept() calls. */
274         unsigned int            state; /* Socket state. Debugging and consistency checking. */
275         int                     state_lineno;  /* line which last touched state (recorded by _set_state()) */
276 };
277
278 #define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
279
280 /*
281  * Buffer structure
282  */
283 typedef struct buflist buflist_t;
284
285 struct buflist {
286         void                    *buf;
287         unsigned int            buflen;
288         ISC_LINK(buflist_t)     link;
289 };
290
291 /*
292  * I/O Completion ports Info structures
293  */
294
/*
 * Private heap from which IoCompletionInfo structures are allocated;
 * created in iocompletionport_init().
 */
295 static HANDLE hHeapHandle = NULL;
/*
 * Per-request context for an overlapped operation.  Pointers to this
 * structure are cast to LPWSAOVERLAPPED when handed to Winsock, so
 * 'overlapped' has to remain the first member.
 */
296 typedef struct IoCompletionInfo {
297         OVERLAPPED              overlapped;
298         isc_socketevent_t       *dev;  /* send()/recv() done event */
299         isc_socket_connev_t     *cdev; /* connect() done event */
300         isc_socket_newconnev_t  *adev; /* accept() done event */
301         void                    *acceptbuffer;
302         DWORD                   received_bytes;
303         int                     request_type;   /* one of SOCKET_RECV, SOCKET_SEND, SOCKET_ACCEPT, SOCKET_CONNECT */
304         struct msghdr           messagehdr;
305         ISC_LIST(buflist_t)     bufferlist;     /*%< list of buffers */
306 } IoCompletionInfo;
307
308 /*
309  * Define a maximum number of I/O Completion Port worker threads
310  * to handle the load on the Completion Port. The actual number
311  * used is the number of CPU's + 1.
312  */
313 #define MAX_IOCPTHREADS 20
314
315 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
316 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
317
/*
 * Socket manager: owns the I/O completion port and the pool of worker
 * threads that service it.
 */
318 struct isc_socketmgr {
319         /* Not locked. */
320         unsigned int                    magic;  /* SOCKET_MANAGER_MAGIC while valid */
321         isc_mem_t                      *mctx;
322         isc_mutex_t                     lock;
323         isc_stats_t                    *stats;
324
325         /* Locked by manager lock. */
326         ISC_LIST(isc_socket_t)          socklist;
327         isc_boolean_t                   bShutdown;
328         isc_condition_t                 shutdown_ok;
329         HANDLE                          hIoCompletionPort;      /* created in iocompletionport_init() */
330         int                             maxIOCPThreads;         /* min(ncpus + 1, MAX_IOCPTHREADS) */
331         HANDLE                          hIOCPThreads[MAX_IOCPTHREADS];  /* worker thread handles */
332         DWORD                           dwIOCPThreadIds[MAX_IOCPTHREADS];       /* worker thread ids */
333
334         /*
335          * Debugging.
336          * Modified by InterlockedIncrement() and InterlockedDecrement()
337          */
338         LONG                            totalSockets;   /* decremented by socket_close() */
339         LONG                            iocp_total;     /* incremented by iocompletionport_update() */
340 };
341
342 enum {
343         SOCKET_RECV,
344         SOCKET_SEND,
345         SOCKET_ACCEPT,
346         SOCKET_CONNECT
347 };
348
349 /*
350  * send() and recv() iovec counts
351  */
352 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
353 #define MAXSCATTERGATHER_RECV   (ISC_SOCKET_MAXSCATTERGATHER)
354
355 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
356                                   isc_sockettype_t type,
357                                   isc_socket_t **socketp,
358                                   isc_socket_t *dup_socket);
359 static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
360 static void maybe_free_socket(isc_socket_t **, int);
361 static void free_socket(isc_socket_t **, int);
362 static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
363 static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
364 static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
365 static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
366 static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
367 static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
368 static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
369 static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
370 static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
371 static void queue_receive_request(isc_socket_t *sock);
372
373 /*
374  * This is used to dump the contents of the sock structure
375  * You should make sure that the sock is locked before
376  * dumping it. Since the code uses simple printf() statements
377  * it should only be used interactively.
378  */
379 void
380 sock_dump(isc_socket_t *sock) {
381         isc_socketevent_t *ldev;
382         isc_socket_newconnev_t *ndev;
383
384 #if 0
385         isc_sockaddr_t addr;
386         char socktext[256];
387
388         isc_socket_getpeername(sock, &addr);
389         isc_sockaddr_format(&addr, socktext, sizeof(socktext));
390         printf("Remote Socket: %s\n", socktext);
391         isc_socket_getsockname(sock, &addr);
392         isc_sockaddr_format(&addr, socktext, sizeof(socktext));
393         printf("This Socket: %s\n", socktext);
394 #endif
395
396         printf("\n\t\tSock Dump\n");
397         printf("\t\tfd: %u\n", sock->fd);
398         printf("\t\treferences: %d\n", sock->references);
399         printf("\t\tpending_accept: %d\n", sock->pending_accept);
400         printf("\t\tconnecting: %d\n", sock->pending_connect);
401         printf("\t\tconnected: %d\n", sock->connected);
402         printf("\t\tbound: %d\n", sock->bound);
403         printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
404         printf("\t\tsocket type: %d\n", sock->type);
405
406         printf("\n\t\tSock Recv List\n");
407         ldev = ISC_LIST_HEAD(sock->recv_list);
408         while (ldev != NULL) {
409                 printf("\t\tdev: %p\n", ldev);
410                 ldev = ISC_LIST_NEXT(ldev, ev_link);
411         }
412
413         printf("\n\t\tSock Send List\n");
414         ldev = ISC_LIST_HEAD(sock->send_list);
415         while (ldev != NULL) {
416                 printf("\t\tdev: %p\n", ldev);
417                 ldev = ISC_LIST_NEXT(ldev, ev_link);
418         }
419
420         printf("\n\t\tSock Accept List\n");
421         ndev = ISC_LIST_HEAD(sock->accept_list);
422         while (ndev != NULL) {
423                 printf("\t\tdev: %p\n", ldev);
424                 ndev = ISC_LIST_NEXT(ndev, ev_link);
425         }
426 }
427
428 static void
429 socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
430            isc_logcategory_t *category, isc_logmodule_t *module, int level,
431            isc_msgcat_t *msgcat, int msgset, int message,
432            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
433
434 /*  This function will add an entry to the I/O completion port
435  *  that will signal the I/O thread to exit (gracefully)
436  */
437 static void
438 signal_iocompletionport_exit(isc_socketmgr_t *manager) {
439         int i;
440         int errval;
441         char strbuf[ISC_STRERRORSIZE];
442
443         REQUIRE(VALID_MANAGER(manager));
444         for (i = 0; i < manager->maxIOCPThreads; i++) {
445                 if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
446                                                 0, 0, 0)) {
447                         errval = GetLastError();
448                         isc__strerror(errval, strbuf, sizeof(strbuf));
449                         FATAL_ERROR(__FILE__, __LINE__,
450                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
451                                 ISC_MSG_FAILED,
452                                 "Can't request service thread to exit: %s"),
453                                 strbuf);
454                 }
455         }
456 }
457
458 /*
459  * Create the worker threads for the I/O Completion Port
460  */
461 void
462 iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
463         int errval;
464         char strbuf[ISC_STRERRORSIZE];
465         int i;
466
467         INSIST(total_threads > 0);
468         REQUIRE(VALID_MANAGER(manager));
469         /*
470          * We need at least one
471          */
472         for (i = 0; i < total_threads; i++) {
473                 manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
474                                                 manager, 0,
475                                                 &manager->dwIOCPThreadIds[i]);
476                 if (manager->hIOCPThreads[i] == NULL) {
477                         errval = GetLastError();
478                         isc__strerror(errval, strbuf, sizeof(strbuf));
479                         FATAL_ERROR(__FILE__, __LINE__,
480                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
481                                 ISC_MSG_FAILED,
482                                 "Can't create IOCP thread: %s"),
483                                 strbuf);
484                         exit(1);
485                 }
486         }
487 }
488
489 /*
490  *  Create/initialise the I/O completion port
491  */
492 void
493 iocompletionport_init(isc_socketmgr_t *manager) {
494         int errval;
495         char strbuf[ISC_STRERRORSIZE];
496
497         REQUIRE(VALID_MANAGER(manager));
498         /*
499          * Create a private heap to handle the socket overlapped structure
500          * The minimum number of structures is 10, there is no maximum
501          */
502         hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
503         if (hHeapHandle == NULL) {
504                 errval = GetLastError();
505                 isc__strerror(errval, strbuf, sizeof(strbuf));
506                 FATAL_ERROR(__FILE__, __LINE__,
507                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
508                                            ISC_MSG_FAILED,
509                                            "HeapCreate() failed during "
510                                            "initialization: %s"),
511                             strbuf);
512                 exit(1);
513         }
514
515         manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
516
517         /* Now Create the Completion Port */
518         manager->hIoCompletionPort = CreateIoCompletionPort(
519                         INVALID_HANDLE_VALUE, NULL,
520                         0, manager->maxIOCPThreads);
521         if (manager->hIoCompletionPort == NULL) {
522                 errval = GetLastError();
523                 isc__strerror(errval, strbuf, sizeof(strbuf));
524                 FATAL_ERROR(__FILE__, __LINE__,
525                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
526                                 ISC_MSG_FAILED,
527                                 "CreateIoCompletionPort() failed "
528                                 "during initialization: %s"),
529                                 strbuf);
530                 exit(1);
531         }
532
533         /*
534          * Worker threads for servicing the I/O
535          */
536         iocompletionport_createthreads(manager->maxIOCPThreads, manager);
537 }
538
539 /*
540  * Associate a socket with an IO Completion Port.  This allows us to queue events for it
541  * and have our worker pool of threads process them.
542  */
543 void
544 iocompletionport_update(isc_socket_t *sock) {
545         HANDLE hiocp;
546         char strbuf[ISC_STRERRORSIZE];
547
548         REQUIRE(VALID_SOCKET(sock));
549
550         hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
551                 sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
552
553         if (hiocp == NULL) {
554                 DWORD errval = GetLastError();
555                 isc__strerror(errval, strbuf, sizeof(strbuf));
556                 isc_log_iwrite(isc_lctx,
557                                 ISC_LOGCATEGORY_GENERAL,
558                                 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
559                                 isc_msgcat, ISC_MSGSET_SOCKET,
560                                 ISC_MSG_TOOMANYHANDLES,
561                                 "iocompletionport_update: failed to open"
562                                 " io completion port: %s",
563                                 strbuf);
564
565                 /* XXXMLG temporary hack to make failures detected.
566                  * This function should return errors to the caller, not
567                  * exit here.
568                  */
569                 FATAL_ERROR(__FILE__, __LINE__,
570                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
571                                 ISC_MSG_FAILED,
572                                 "CreateIoCompletionPort() failed "
573                                 "during initialization: %s"),
574                                 strbuf);
575                 exit(1);
576         }
577
578         InterlockedIncrement(&sock->manager->iocp_total);
579 }
580
581 /*
582  * Routine to cleanup and then close the socket.
583  * Only close the socket here if it is NOT associated
584  * with an event, otherwise the WSAWaitForMultipleEvents
585  * may fail due to the fact that the Wait should not
586  * be running while closing an event or a socket.
587  * The socket is locked before calling this function
588  */
589 void
590 socket_close(isc_socket_t *sock) {
591
592         REQUIRE(sock != NULL);
593
594         if (sock->fd != INVALID_SOCKET) {
595                 closesocket(sock->fd);
596                 sock->fd = INVALID_SOCKET;
597                 _set_state(sock, SOCK_CLOSED);
598                 InterlockedDecrement(&sock->manager->totalSockets);
599         }
600 }
601
602 static isc_once_t initialise_once = ISC_ONCE_INIT;
603 static isc_boolean_t initialised = ISC_FALSE;
604
605 static void
606 initialise(void) {
607         WORD wVersionRequested;
608         WSADATA wsaData;
609         int err;
610         SOCKET sock;
611         GUID GUIDConnectEx = WSAID_CONNECTEX;
612         GUID GUIDAcceptEx = WSAID_ACCEPTEX;
613         GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
614         DWORD dwBytes;
615
616         /* Need Winsock 2.2 or better */
617         wVersionRequested = MAKEWORD(2, 2);
618
619         err = WSAStartup(wVersionRequested, &wsaData);
620         if (err != 0) {
621                 char strbuf[ISC_STRERRORSIZE];
622                 isc__strerror(err, strbuf, sizeof(strbuf));
623                 FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
624                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
625                                            ISC_MSG_FAILED, "failed"),
626                             strbuf);
627                 exit(1);
628         }
629         /*
630          * The following APIs do not exist as functions in a library, but we must
631          * ask winsock for them.  They are "extensions" -- but why they cannot be
632          * actual functions is beyond me.  So, ask winsock for the pointers to the
633          * functions we need.
634          */
635         sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
636         INSIST(sock != INVALID_SOCKET);
637         err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
638                  &GUIDConnectEx, sizeof(GUIDConnectEx),
639                  &ISCConnectEx, sizeof(ISCConnectEx),
640                  &dwBytes, NULL, NULL);
641         INSIST(err == 0);
642
643         err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
644                  &GUIDAcceptEx, sizeof(GUIDAcceptEx),
645                  &ISCAcceptEx, sizeof(ISCAcceptEx),
646                  &dwBytes, NULL, NULL);
647         INSIST(err == 0);
648
649         err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
650                  &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
651                  &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
652                  &dwBytes, NULL, NULL);
653         INSIST(err == 0);
654
655         closesocket(sock);
656
657         initialised = ISC_TRUE;
658 }
659
660 /*
661  * Initialize socket services
662  */
663 void
664 InitSockets(void) {
665         RUNTIME_CHECK(isc_once_do(&initialise_once,
666                                   initialise) == ISC_R_SUCCESS);
667         if (!initialised)
668                 exit(1);
669 }
670
671 int
672 internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
673                  struct msghdr *messagehdr, int flags, int *Error)
674 {
675         int Result;
676         DWORD BytesSent;
677         DWORD Flags = flags;
678         int total_sent;
679
680         *Error = 0;
681         Result = WSASendTo(sock->fd, messagehdr->msg_iov,
682                            messagehdr->msg_iovlen, &BytesSent,
683                            Flags, (SOCKADDR *)&messagehdr->to_addr,
684                            messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
685                            NULL);
686
687         total_sent = (int)BytesSent;
688
689         /* Check for errors.*/
690         if (Result == SOCKET_ERROR) {
691                 *Error = WSAGetLastError();
692
693                 switch (*Error) {
694                 case WSA_IO_INCOMPLETE:
695                 case WSA_WAIT_IO_COMPLETION:
696                 case WSA_IO_PENDING:
697                 case NO_ERROR:          /* Strange, but okay */
698                         sock->pending_iocp++;
699                         sock->pending_send++;
700                         break;
701
702                 default:
703                         return (-1);
704                         break;
705                 }
706         } else {
707                 sock->pending_iocp++;
708                 sock->pending_send++;
709         }
710
711         if (lpo != NULL)
712                 return (0);
713         else
714                 return (total_sent);
715 }
716
717 static void
718 queue_receive_request(isc_socket_t *sock) {
719         DWORD Flags = 0;
720         DWORD NumBytes = 0;
721         int total_bytes = 0;
722         int Result;
723         int Error;
724         int need_retry;
725         WSABUF iov[1];
726         IoCompletionInfo *lpo = NULL;
727         isc_result_t isc_result;
728
729  retry:
730         need_retry = ISC_FALSE;
731
732         /*
733          * If we already have a receive pending, do nothing.
734          */
735         if (sock->pending_recv > 0) {
736                 if (lpo != NULL)
737                         HeapFree(hHeapHandle, 0, lpo);
738                 return;
739         }
740
741         /*
742          * If no one is waiting, do nothing.
743          */
744         if (ISC_LIST_EMPTY(sock->recv_list)) {
745                 if (lpo != NULL)
746                         HeapFree(hHeapHandle, 0, lpo);
747                 return;
748         }
749
750         INSIST(sock->recvbuf.remaining == 0);
751         INSIST(sock->fd != INVALID_SOCKET);
752
753         iov[0].len = sock->recvbuf.len;
754         iov[0].buf = sock->recvbuf.base;
755
756         if (lpo == NULL) {
757                 lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
758                                                     HEAP_ZERO_MEMORY,
759                                                     sizeof(IoCompletionInfo));
760                 RUNTIME_CHECK(lpo != NULL);
761         } else
762                 ZeroMemory(lpo, sizeof(IoCompletionInfo));
763         lpo->request_type = SOCKET_RECV;
764
765         sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
766
767         Error = 0;
768         Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
769                              &NumBytes, &Flags,
770                              (SOCKADDR *)&sock->recvbuf.from_addr,
771                              &sock->recvbuf.from_addr_len,
772                              (LPWSAOVERLAPPED)lpo, NULL);
773
774         /* Check for errors. */
775         if (Result == SOCKET_ERROR) {
776                 Error = WSAGetLastError();
777
778                 switch (Error) {
779                 case WSA_IO_PENDING:
780                         sock->pending_iocp++;
781                         sock->pending_recv++;
782                         break;
783
784                 /* direct error: no completion event */
785                 case ERROR_HOST_UNREACHABLE:
786                 case WSAENETRESET:
787                 case WSAECONNRESET:
788                         if (!sock->connected) {
789                                 /* soft error */
790                                 need_retry = ISC_TRUE;
791                                 break;
792                         }
793                         /* FALLTHROUGH */
794
795                 default:
796                         isc_result = isc__errno2result(Error);
797                         if (isc_result == ISC_R_UNEXPECTED)
798                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
799                                         "WSARecvFrom: Windows error code: %d, isc result %d",
800                                         Error, isc_result);
801                         send_recvdone_abort(sock, isc_result);
802                         HeapFree(hHeapHandle, 0, lpo);
803                         lpo = NULL;
804                         break;
805                 }
806         } else {
807                 /*
808                  * The recv() finished immediately, but we will still get
809                  * a completion event.  Rather than duplicate code, let
810                  * that thread handle sending the data along its way.
811                  */
812                 sock->pending_iocp++;
813                 sock->pending_recv++;
814         }
815
816         socket_log(__LINE__, sock, NULL, IOEVENT,
817                    isc_msgcat, ISC_MSGSET_SOCKET,
818                    ISC_MSG_DOIORECV,
819                    "queue_io_request: fd %d result %d error %d",
820                    sock->fd, Result, Error);
821
822         CONSISTENT(sock);
823
824         if (need_retry)
825                 goto retry;
826 }
827
828 static void
829 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
830             isc_logmodule_t *module, int level, const char *fmt, ...)
831 {
832         char msgbuf[2048];
833         va_list ap;
834
835         if (!isc_log_wouldlog(isc_lctx, level))
836                 return;
837
838         va_start(ap, fmt);
839         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
840         va_end(ap);
841
842         isc_log_write(isc_lctx, category, module, level,
843                       "sockmgr %p: %s", sockmgr, msgbuf);
844 }
845
846 static void
847 socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
848            isc_logcategory_t *category, isc_logmodule_t *module, int level,
849            isc_msgcat_t *msgcat, int msgset, int message,
850            const char *fmt, ...)
851 {
852         char msgbuf[2048];
853         char peerbuf[256];
854         va_list ap;
855
856
857         if (!isc_log_wouldlog(isc_lctx, level))
858                 return;
859
860         va_start(ap, fmt);
861         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
862         va_end(ap);
863
864         if (address == NULL) {
865                 isc_log_iwrite(isc_lctx, category, module, level,
866                                msgcat, msgset, message,
867                                "socket %p line %d: %s", sock, lineno, msgbuf);
868         } else {
869                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
870                 isc_log_iwrite(isc_lctx, category, module, level,
871                                msgcat, msgset, message,
872                                    "socket %p line %d peer %s: %s", sock, lineno,
873                                    peerbuf, msgbuf);
874         }
875
876 }
877
878 /*
879  * Make an fd SOCKET non-blocking.
880  */
881 static isc_result_t
882 make_nonblock(SOCKET fd) {
883         int ret;
884         unsigned long flags = 1;
885         char strbuf[ISC_STRERRORSIZE];
886
887         /* Set the socket to non-blocking */
888         ret = ioctlsocket(fd, FIONBIO, &flags);
889
890         if (ret == -1) {
891                 isc__strerror(errno, strbuf, sizeof(strbuf));
892                 UNEXPECTED_ERROR(__FILE__, __LINE__,
893                                  "ioctlsocket(%d, FIOBIO, %d): %s",
894                                  fd, flags, strbuf);
895
896                 return (ISC_R_UNEXPECTED);
897         }
898
899         return (ISC_R_SUCCESS);
900 }
901
902 /*
903  * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
904  * to not work correctly, returning a WSACONNRESET error when a WSASendTo
905  * fails with an "ICMP port unreachable" response and preventing the
906  * socket from using the WSARecvFrom in subsequent operations.
907  * The function below fixes this, but requires that Windows 2000
908  * Service Pack 2 or later be installed on the system.  NT 4.0
909  * systems are not affected by this and work correctly.
910  * See Microsoft Knowledge Base Article Q263823 for details of this.
911  */
912 isc_result_t
913 connection_reset_fix(SOCKET fd) {
914         DWORD dwBytesReturned = 0;
915         BOOL  bNewBehavior = FALSE;
916         DWORD status;
917
918         if (isc_win32os_majorversion() < 5)
919                 return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
920
921         /* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
922         status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
923                           sizeof(bNewBehavior), NULL, 0,
924                           &dwBytesReturned, NULL, NULL);
925         if (status != SOCKET_ERROR)
926                 return (ISC_R_SUCCESS);
927         else {
928                 UNEXPECTED_ERROR(__FILE__, __LINE__,
929                                  "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
930                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
931                                                 ISC_MSG_FAILED, "failed"));
932                 return (ISC_R_UNEXPECTED);
933         }
934 }
935
936 /*
937  * Construct an iov array and attach it to the msghdr passed in.  This is
938  * the SEND constructor, which will use the used region of the buffer
939  * (if using a buffer list) or will use the internal region (if a single
940  * buffer I/O is requested).
941  *
942  * Nothing can be NULL, and the done event must list at least one buffer
943  * on the buffer linked list for this function to be meaningful.
944  */
945 static void
946 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
947                   struct msghdr *msg, char *cmsg, WSABUF *iov,
948                   IoCompletionInfo  *lpo)
949 {
950         unsigned int iovcount;
951         isc_buffer_t *buffer;
952         buflist_t  *cpbuffer;
953         isc_region_t used;
954         size_t write_count;
955         size_t skip_count;
956
957         memset(msg, 0, sizeof(*msg));
958
959         memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
960         msg->to_addr_len = dev->address.length;
961
962         buffer = ISC_LIST_HEAD(dev->bufferlist);
963         write_count = 0;
964         iovcount = 0;
965
966         /*
967          * Single buffer I/O?  Skip what we've done so far in this region.
968          */
969         if (buffer == NULL) {
970                 write_count = dev->region.length - dev->n;
971                 cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
972                 RUNTIME_CHECK(cpbuffer != NULL);
973                 cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
974                 RUNTIME_CHECK(cpbuffer->buf != NULL);
975
976                 socket_log(__LINE__, sock, NULL, TRACE,
977                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
978                    "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
979                    cpbuffer->buf, write_count);
980
981                 memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
982                 cpbuffer->buflen = write_count;
983                 ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
984                 iov[0].buf = cpbuffer->buf;
985                 iov[0].len = write_count;
986                 iovcount = 1;
987
988                 goto config;
989         }
990
991         /*
992          * Multibuffer I/O.
993          * Skip the data in the buffer list that we have already written.
994          */
995         skip_count = dev->n;
996         while (buffer != NULL) {
997                 REQUIRE(ISC_BUFFER_VALID(buffer));
998                 if (skip_count < isc_buffer_usedlength(buffer))
999                         break;
1000                 skip_count -= isc_buffer_usedlength(buffer);
1001                 buffer = ISC_LIST_NEXT(buffer, link);
1002         }
1003
1004         while (buffer != NULL) {
1005                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1006
1007                 isc_buffer_usedregion(buffer, &used);
1008
1009                 if (used.length > 0) {
1010                         int uselen = used.length - skip_count;
1011                         cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1012                         RUNTIME_CHECK(cpbuffer != NULL);
1013                         cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1014                         RUNTIME_CHECK(cpbuffer->buf != NULL);
1015
1016                         socket_log(__LINE__, sock, NULL, TRACE,
1017                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1018                            "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1019                            cpbuffer->buf, write_count);
1020
1021                         memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1022                         cpbuffer->buflen = uselen;
1023                         iov[iovcount].buf = cpbuffer->buf;
1024                         iov[iovcount].len = used.length - skip_count;
1025                         write_count += uselen;
1026                         skip_count = 0;
1027                         iovcount++;
1028                 }
1029                 buffer = ISC_LIST_NEXT(buffer, link);
1030         }
1031
1032         INSIST(skip_count == 0);
1033
1034  config:
1035         msg->msg_iov = iov;
1036         msg->msg_iovlen = iovcount;
1037         msg->msg_totallen = write_count;
1038 }
1039
1040 static void
1041 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1042                 isc_socketevent_t *dev)
1043 {
1044         if (sock->type == isc_sockettype_udp) {
1045                 if (address != NULL)
1046                         dev->address = *address;
1047                 else
1048                         dev->address = sock->address;
1049         } else if (sock->type == isc_sockettype_tcp) {
1050                 INSIST(address == NULL);
1051                 dev->address = sock->address;
1052         }
1053 }
1054
1055 static void
1056 destroy_socketevent(isc_event_t *event) {
1057         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1058
1059         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1060
1061         (ev->destroy)(event);
1062 }
1063
1064 static isc_socketevent_t *
1065 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1066                      isc_taskaction_t action, const void *arg)
1067 {
1068         isc_socketevent_t *ev;
1069
1070         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1071                                                      sock, eventtype,
1072                                                      action, arg,
1073                                                      sizeof(*ev));
1074         if (ev == NULL)
1075                 return (NULL);
1076
1077         ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1078         ISC_LINK_INIT(ev, ev_link);
1079         ISC_LIST_INIT(ev->bufferlist);
1080         ev->region.base = NULL;
1081         ev->n = 0;
1082         ev->offset = 0;
1083         ev->attributes = 0;
1084         ev->destroy = ev->ev_destroy;
1085         ev->ev_destroy = destroy_socketevent;
1086
1087         return (ev);
1088 }
1089
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' and the socket's fd to
 * stdout, followed by one line per iov entry.
 *
 * NOTE(review): msg_name/msg_namelen are not the fields that
 * build_msghdr_send() fills in (it uses to_addr/to_addr_len) -- confirm
 * they exist in this file's struct msghdr before enabling
 * ISC_SOCKET_DEBUG.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
        unsigned int idx = 0;

        printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
        printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
        printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);

        while (idx < (unsigned int)msg->msg_iovlen) {
                printf("\t\t%d\tbase %p, len %d\n", idx,
                       msg->msg_iov[idx].buf,
                       msg->msg_iov[idx].len);
                idx++;
        }
}
#endif
1104
1105 /*
1106  * map the error code
1107  */
1108 int
1109 map_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1110                  char *errorstring, size_t bufsize) {
1111
1112         int doreturn;
1113         switch (windows_errno) {
1114         case WSAECONNREFUSED:
1115                 *isc_errno = ISC_R_CONNREFUSED;
1116                 if (sock->connected)
1117                         doreturn = DOIO_HARD;
1118                 else
1119                         doreturn = DOIO_SOFT;
1120                 break;
1121         case WSAENETUNREACH:
1122         case ERROR_NETWORK_UNREACHABLE:
1123                 *isc_errno = ISC_R_NETUNREACH;
1124                 if (sock->connected)
1125                         doreturn = DOIO_HARD;
1126                 else
1127                         doreturn = DOIO_SOFT;
1128                 break;
1129         case ERROR_PORT_UNREACHABLE:
1130         case ERROR_HOST_UNREACHABLE:
1131         case WSAEHOSTUNREACH:
1132                 *isc_errno = ISC_R_HOSTUNREACH;
1133                 if (sock->connected)
1134                         doreturn = DOIO_HARD;
1135                 else
1136                         doreturn = DOIO_SOFT;
1137                 break;
1138         case WSAENETDOWN:
1139                 *isc_errno = ISC_R_NETDOWN;
1140                 if (sock->connected)
1141                         doreturn = DOIO_HARD;
1142                 else
1143                         doreturn = DOIO_SOFT;
1144                 break;
1145         case WSAEHOSTDOWN:
1146                 *isc_errno = ISC_R_HOSTDOWN;
1147                 if (sock->connected)
1148                         doreturn = DOIO_HARD;
1149                 else
1150                         doreturn = DOIO_SOFT;
1151                 break;
1152         case WSAEACCES:
1153                 *isc_errno = ISC_R_NOPERM;
1154                 if (sock->connected)
1155                         doreturn = DOIO_HARD;
1156                 else
1157                         doreturn = DOIO_SOFT;
1158                 break;
1159         case WSAECONNRESET:
1160         case WSAENETRESET:
1161         case WSAECONNABORTED:
1162         case WSAEDISCON:
1163                 *isc_errno = ISC_R_CONNECTIONRESET;
1164                 if (sock->connected)
1165                         doreturn = DOIO_HARD;
1166                 else
1167                         doreturn = DOIO_SOFT;
1168                 break;
1169         case WSAENOTCONN:
1170                 *isc_errno = ISC_R_NOTCONNECTED;
1171                 if (sock->connected)
1172                         doreturn = DOIO_HARD;
1173                 else
1174                         doreturn = DOIO_SOFT;
1175                 break;
1176         case ERROR_OPERATION_ABORTED:
1177         case ERROR_CONNECTION_ABORTED:
1178         case ERROR_REQUEST_ABORTED:
1179                 *isc_errno = ISC_R_CONNECTIONRESET;
1180                 doreturn = DOIO_HARD;
1181                 break;
1182         case WSAENOBUFS:
1183                 *isc_errno = ISC_R_NORESOURCES;
1184                 doreturn = DOIO_HARD;
1185                 break;
1186         case WSAEAFNOSUPPORT:
1187                 *isc_errno = ISC_R_FAMILYNOSUPPORT;
1188                 doreturn = DOIO_HARD;
1189                 break;
1190         case WSAEADDRNOTAVAIL:
1191                 *isc_errno = ISC_R_ADDRNOTAVAIL;
1192                 doreturn = DOIO_HARD;
1193                 break;
1194         case WSAEDESTADDRREQ:
1195                 *isc_errno = ISC_R_BADADDRESSFORM;
1196                 doreturn = DOIO_HARD;
1197                 break;
1198         case ERROR_NETNAME_DELETED:
1199                 *isc_errno = ISC_R_NETDOWN;
1200                 doreturn = DOIO_HARD;
1201                 break;
1202         default:
1203                 *isc_errno = ISC_R_IOERROR;
1204                 doreturn = DOIO_HARD;
1205                 break;
1206         }
1207         if (doreturn == DOIO_HARD) {
1208                 isc__strerror(windows_errno, errorstring, bufsize);
1209         }
1210         return (doreturn);
1211 }
1212
1213 static void
1214 fill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1215         isc_region_t r;
1216         int copylen;
1217         isc_buffer_t *buffer;
1218
1219         INSIST(dev->n < dev->minimum);
1220         INSIST(sock->recvbuf.remaining > 0);
1221         INSIST(sock->pending_recv == 0);
1222
1223         if (sock->type == isc_sockettype_udp) {
1224                 dev->address.length = sock->recvbuf.from_addr_len;
1225                 memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1226                     sock->recvbuf.from_addr_len);
1227                 if (isc_sockaddr_getport(&dev->address) == 0) {
1228                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1229                                 socket_log(__LINE__, sock, &dev->address, IOEVENT,
1230                                            isc_msgcat, ISC_MSGSET_SOCKET,
1231                                            ISC_MSG_ZEROPORT,
1232                                            "dropping source port zero packet");
1233                         }
1234                         sock->recvbuf.remaining = 0;
1235                         return;
1236                 }
1237         } else if (sock->type == isc_sockettype_tcp) {
1238                 dev->address = sock->address;
1239         }
1240
1241         /*
1242          * Run through the list of buffers we were given, and find the
1243          * first one with space.  Once it is found, loop through, filling
1244          * the buffers as much as possible.
1245          */
1246         buffer = ISC_LIST_HEAD(dev->bufferlist);
1247         if (buffer != NULL) { // Multi-buffer receive
1248                 while (buffer != NULL && sock->recvbuf.remaining > 0) {
1249                         REQUIRE(ISC_BUFFER_VALID(buffer));
1250                         if (isc_buffer_availablelength(buffer) > 0) {
1251                                 isc_buffer_availableregion(buffer, &r);
1252                                 copylen = min(r.length, sock->recvbuf.remaining);
1253                                 memcpy(r.base, sock->recvbuf.consume_position, copylen);
1254                                 sock->recvbuf.consume_position += copylen;
1255                                 sock->recvbuf.remaining -= copylen;
1256                                 isc_buffer_add(buffer, copylen);
1257                                 dev->n += copylen;
1258                         }
1259                         buffer = ISC_LIST_NEXT(buffer, link);
1260                 }
1261         } else { // Single-buffer receive
1262                 copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1263                 memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1264                 sock->recvbuf.consume_position += copylen;
1265                 sock->recvbuf.remaining -= copylen;
1266                 dev->n += copylen;
1267         }
1268
1269         /*
1270          * UDP receives are all-consuming.  That is, if we have 4k worth of
1271          * data in our receive buffer, and the caller only gave us
1272          * 1k of space, we will toss the remaining 3k of data.  TCP
1273          * will keep the extra data around and use it for later requests.
1274          */
1275         if (sock->type == isc_sockettype_udp)
1276                 sock->recvbuf.remaining = 0;
1277 }
1278
1279 /*
1280  * Copy out as much data from the internal buffer to done events.
1281  * As each done event is filled, send it along its way.
1282  */
1283 static void
1284 completeio_recv(isc_socket_t *sock)
1285 {
1286         isc_socketevent_t *dev;
1287
1288         /*
1289          * If we are in the process of filling our buffer, we cannot
1290          * touch it yet, so don't.
1291          */
1292         if (sock->pending_recv > 0)
1293                 return;
1294
1295         while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1296                 dev = ISC_LIST_HEAD(sock->recv_list);
1297
1298                 /*
1299                  * See if we have sufficient data in our receive buffer
1300                  * to handle this.  If we do, copy out the data.
1301                  */
1302                 fill_recv(sock, dev);
1303
1304                 /*
1305                  * Did we satisfy it?
1306                  */
1307                 if (dev->n >= dev->minimum) {
1308                         dev->result = ISC_R_SUCCESS;
1309                         send_recvdone_event(sock, &dev);
1310                 }
1311         }
1312 }
1313
1314 /*
1315  * Returns:
1316  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1317  *                      ISC_R_SUCCESS.
1318  *
1319  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1320  *                      dev->result contains the appropriate error.
1321  *
1322  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1323  *                      event was sent.  The operation should be retried.
1324  *
1325  *      No other return values are possible.
1326  */
1327 static int
1328 completeio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1329                 struct msghdr *messagehdr, int cc, int send_errno)
1330 {
1331         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1332         char strbuf[ISC_STRERRORSIZE];
1333
1334         if (send_errno != 0) {
1335                 if (SOFT_ERROR(send_errno))
1336                         return (DOIO_SOFT);
1337
1338                 return (map_socket_error(sock, send_errno, &dev->result,
1339                         strbuf, sizeof(strbuf)));
1340
1341                 /*
1342                  * The other error types depend on whether or not the
1343                  * socket is UDP or TCP.  If it is UDP, some errors
1344                  * that we expect to be fatal under TCP are merely
1345                  * annoying, and are really soft errors.
1346                  *
1347                  * However, these soft errors are still returned as
1348                  * a status.
1349                  */
1350                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1351                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1352                 UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1353                                  addrbuf, strbuf);
1354                 dev->result = isc__errno2result(send_errno);
1355                 return (DOIO_HARD);
1356         }
1357
1358         /*
1359          * If we write less than we expected, update counters, poke.
1360          */
1361         dev->n += cc;
1362         if (cc != messagehdr->msg_totallen)
1363                 return (DOIO_SOFT);
1364
1365         /*
1366          * Exactly what we wanted to write.  We're done with this
1367          * entry.  Post its completion event.
1368          */
1369         dev->result = ISC_R_SUCCESS;
1370         return (DOIO_SUCCESS);
1371 }
1372
1373 static int
1374 startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1375              int *send_errno)
1376 {
1377         char *cmsg = NULL;
1378         char strbuf[ISC_STRERRORSIZE];
1379         IoCompletionInfo *lpo;
1380         int status;
1381         struct msghdr *msghdr;
1382
1383         lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1384                                             HEAP_ZERO_MEMORY,
1385                                             sizeof(IoCompletionInfo));
1386         RUNTIME_CHECK(lpo != NULL);
1387         lpo->request_type = SOCKET_SEND;
1388         lpo->dev = dev;
1389         msghdr = &lpo->messagehdr;
1390         memset(msghdr, 0, sizeof(struct msghdr));
1391         ISC_LIST_INIT(lpo->bufferlist);
1392
1393         build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1394
1395         *nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1396
1397         if (*nbytes < 0) {
1398                 /*
1399                  * I/O has been initiated
1400                  * completion will be through the completion port
1401                  */
1402                 if (PENDING_ERROR(*send_errno)) {
1403                         status = DOIO_PENDING;
1404                         goto done;
1405                 }
1406
1407                 if (SOFT_ERROR(*send_errno)) {
1408                         status = DOIO_SOFT;
1409                         goto done;
1410                 }
1411
1412                 /*
1413                  * If we got this far then something is wrong
1414                  */
1415                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416                         isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1417                         socket_log(__LINE__, sock, NULL, IOEVENT,
1418                                    isc_msgcat, ISC_MSGSET_SOCKET,
1419                                    ISC_MSG_INTERNALSEND,
1420                                    "startio_send: internal_sendmsg(%d) %d "
1421                                    "bytes, err %d/%s",
1422                                    sock->fd, *nbytes, *send_errno, strbuf);
1423                 }
1424                 status = DOIO_HARD;
1425                 goto done;
1426         }
1427         dev->result = ISC_R_SUCCESS;
1428         status = DOIO_SOFT;
1429  done:
1430         _set_state(sock, SOCK_DATA);
1431         return (status);
1432 }
1433
1434 static isc_result_t
1435 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1436                 isc_socket_t **socketp) {
1437         isc_socket_t *sock;
1438         isc_result_t result;
1439
1440         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1441
1442         if (sock == NULL)
1443                 return (ISC_R_NOMEMORY);
1444
1445         sock->magic = 0;
1446         sock->references = 0;
1447
1448         sock->manager = manager;
1449         sock->type = type;
1450         sock->fd = INVALID_SOCKET;
1451
1452         ISC_LINK_INIT(sock, link);
1453
1454         /*
1455          * set up list of readers and writers to be initially empty
1456          */
1457         ISC_LIST_INIT(sock->recv_list);
1458         ISC_LIST_INIT(sock->send_list);
1459         ISC_LIST_INIT(sock->accept_list);
1460         sock->connect_ev = NULL;
1461         sock->pending_accept = 0;
1462         sock->pending_recv = 0;
1463         sock->pending_send = 0;
1464         sock->pending_iocp = 0;
1465         sock->listener = 0;
1466         sock->connected = 0;
1467         sock->pending_connect = 0;
1468         sock->bound = 0;
1469         sock->dupped = 0;
1470         memset(sock->name, 0, sizeof(sock->name));      // zero the name field
1471         _set_state(sock, SOCK_INITIALIZED);
1472
1473         sock->recvbuf.len = 65536;
1474         sock->recvbuf.consume_position = sock->recvbuf.base;
1475         sock->recvbuf.remaining = 0;
1476         sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1477         if (sock->recvbuf.base == NULL) {
1478                 sock->magic = 0;
1479                 goto error;
1480         }
1481
1482         /*
1483          * initialize the lock
1484          */
1485         result = isc_mutex_init(&sock->lock);
1486         if (result != ISC_R_SUCCESS) {
1487                 sock->magic = 0;
1488                 isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1489                 sock->recvbuf.base = NULL;
1490                 goto error;
1491         }
1492
1493         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1494                    "allocated");
1495
1496         sock->magic = SOCKET_MAGIC;
1497         *socketp = sock;
1498
1499         return (ISC_R_SUCCESS);
1500
1501  error:
1502         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1503
1504         return (result);
1505 }
1506
1507 /*
1508  * Verify that the socket state is consistent.
1509  */
1510 static void
1511 consistent(isc_socket_t *sock) {
1512
1513         isc_socketevent_t *dev;
1514         isc_socket_newconnev_t *nev;
1515         unsigned int count;
1516         char *crash_reason;
1517         isc_boolean_t crash = ISC_FALSE;
1518
1519         REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1520                 + sock->pending_accept + sock->pending_connect);
1521
1522         dev = ISC_LIST_HEAD(sock->send_list);
1523         count = 0;
1524         while (dev != NULL) {
1525                 count++;
1526                 dev = ISC_LIST_NEXT(dev, ev_link);
1527         }
1528         if (count > sock->pending_send) {
1529                 crash = ISC_TRUE;
1530                 crash_reason = "send_list > sock->pending_send";
1531         }
1532
1533         nev = ISC_LIST_HEAD(sock->accept_list);
1534         count = 0;
1535         while (nev != NULL) {
1536                 count++;
1537                 nev = ISC_LIST_NEXT(nev, ev_link);
1538         }
1539         if (count > sock->pending_accept) {
1540                 crash = ISC_TRUE;
1541                 crash_reason = "send_list > sock->pending_send";
1542         }
1543
1544         if (crash) {
1545                 socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1546                            ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1547                            crash_reason);
1548                 sock_dump(sock);
1549                 INSIST(crash == ISC_FALSE);
1550         }
1551 }
1552
1553 /*
1554  * Maybe free the socket.
1555  *
1556  * This function will verify tht the socket is no longer in use in any way,
1557  * either internally or externally.  This is the only place where this
1558  * check is to be made; if some bit of code believes that IT is done with
1559  * the socket (e.g., some reference counter reaches zero), it should call
1560  * this function.
1561  *
1562  * When calling this function, the socket must be locked, and the manager
1563  * must be unlocked.
1564  *
1565  * When this function returns, *socketp will be NULL.  No tricks to try
1566  * to hold on to this pointer are allowed.
1567  */
1568 static void
1569 maybe_free_socket(isc_socket_t **socketp, int lineno) {
1570         isc_socket_t *sock = *socketp;
1571         *socketp = NULL;
1572
1573         INSIST(VALID_SOCKET(sock));
1574         CONSISTENT(sock);
1575
1576         if (sock->pending_iocp > 0
1577             || sock->pending_recv > 0
1578             || sock->pending_send > 0
1579             || sock->pending_accept > 0
1580             || sock->references > 0
1581             || sock->pending_connect == 1
1582             || !ISC_LIST_EMPTY(sock->recv_list)
1583             || !ISC_LIST_EMPTY(sock->send_list)
1584             || !ISC_LIST_EMPTY(sock->accept_list)
1585             || sock->fd != INVALID_SOCKET) {
1586                 UNLOCK(&sock->lock);
1587                 return;
1588         }
1589         UNLOCK(&sock->lock);
1590
1591         free_socket(&sock, lineno);
1592 }
1593
1594 void
1595 free_socket(isc_socket_t **sockp, int lineno) {
1596         isc_socketmgr_t *manager;
1597         isc_socket_t *sock = *sockp;
1598         *sockp = NULL;
1599
1600         manager = sock->manager;
1601
1602         /*
1603          * Seems we can free the socket after all.
1604          */
1605         manager = sock->manager;
1606         socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1607                    ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1608                    lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1609
1610         sock->magic = 0;
1611         DESTROYLOCK(&sock->lock);
1612
1613         if (sock->recvbuf.base != NULL)
1614                 isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1615
1616         LOCK(&manager->lock);
1617         if (ISC_LINK_LINKED(sock, link))
1618                 ISC_LIST_UNLINK(manager->socklist, sock, link);
1619         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1620
1621         if (ISC_LIST_EMPTY(manager->socklist))
1622                 SIGNAL(&manager->shutdown_ok);
1623         UNLOCK(&manager->lock);
1624 }
1625
1626 /*
1627  * Create a new 'type' socket managed by 'manager'.  Events
1628  * will be posted to 'task' and when dispatched 'action' will be
1629  * called with 'arg' as the arg value.  The new socket is returned
1630  * in 'socketp'.
1631  */
1632 static isc_result_t
1633 socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1634               isc_socket_t **socketp, isc_socket_t *dup_socket)
1635 {
1636         isc_socket_t *sock = NULL;
1637         isc_result_t result;
1638 #if defined(USE_CMSG)
1639         int on = 1;
1640 #endif
1641 #if defined(SO_RCVBUF)
1642         ISC_SOCKADDR_LEN_T optlen;
1643         int size;
1644 #endif
1645         int socket_errno;
1646         char strbuf[ISC_STRERRORSIZE];
1647
1648         REQUIRE(VALID_MANAGER(manager));
1649         REQUIRE(socketp != NULL && *socketp == NULL);
1650         REQUIRE(type != isc_sockettype_fdwatch);
1651
1652         if (dup_socket != NULL)
1653                 return (ISC_R_NOTIMPLEMENTED);
1654
1655         result = allocate_socket(manager, type, &sock);
1656         if (result != ISC_R_SUCCESS)
1657                 return (result);
1658
1659         sock->pf = pf;
1660 #if 0
1661         if (dup_socket == NULL) {
1662 #endif
1663                 switch (type) {
1664                 case isc_sockettype_udp:
1665                         sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1666                         if (sock->fd != INVALID_SOCKET) {
1667                                 result = connection_reset_fix(sock->fd);
1668                                 if (result != ISC_R_SUCCESS) {
1669                                         socket_log(__LINE__, sock,
1670                                                 NULL, EVENT, NULL, 0, 0,
1671                                                 "closed %d %d %d "
1672                                                 "con_reset_fix_failed",
1673                                                 sock->pending_recv,
1674                                                 sock->pending_send,
1675                                                 sock->references);
1676                                         closesocket(sock->fd);
1677                                         _set_state(sock, SOCK_CLOSED);
1678                                         sock->fd = INVALID_SOCKET;
1679                                         free_socket(&sock, __LINE__);
1680                                         return (result);
1681                                 }
1682                         }
1683                         break;
1684                 case isc_sockettype_tcp:
1685                         sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1686                         break;
1687                 }
1688 #if 0
1689         } else {
1690                 /*
1691                  * XXX: dup() is deprecated in windows, use _dup()
1692                  * instead.  In future we may want to investigate
1693                  * WSADuplicateSocket().
1694                  */
1695                 sock->fd = _dup(dup_socket->fd);
1696                 sock->dupped = 1;
1697                 sock->bound = dup_socket->bound;
1698         }
1699 #endif
1700
1701         if (sock->fd == INVALID_SOCKET) {
1702                 socket_errno = WSAGetLastError();
1703                 free_socket(&sock, __LINE__);
1704
1705                 switch (socket_errno) {
1706                 case WSAEMFILE:
1707                 case WSAENOBUFS:
1708                         return (ISC_R_NORESOURCES);
1709
1710                 case WSAEPROTONOSUPPORT:
1711                 case WSAEPFNOSUPPORT:
1712                 case WSAEAFNOSUPPORT:
1713                         return (ISC_R_FAMILYNOSUPPORT);
1714
1715                 default:
1716                         isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1717                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1718                                          "socket() %s: %s",
1719                                          isc_msgcat_get(isc_msgcat,
1720                                                         ISC_MSGSET_GENERAL,
1721                                                         ISC_MSG_FAILED,
1722                                                         "failed"),
1723                                          strbuf);
1724                         return (ISC_R_UNEXPECTED);
1725                 }
1726         }
1727
1728         result = make_nonblock(sock->fd);
1729         if (result != ISC_R_SUCCESS) {
1730                 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1731                         "closed %d %d %d make_nonblock_failed",
1732                         sock->pending_recv, sock->pending_send,
1733                         sock->references);
1734                 closesocket(sock->fd);
1735                 sock->fd = INVALID_SOCKET;
1736                 free_socket(&sock, __LINE__);
1737                 return (result);
1738         }
1739
1740
1741 #if defined(USE_CMSG) || defined(SO_RCVBUF)
1742         if (type == isc_sockettype_udp) {
1743
1744 #if defined(USE_CMSG)
1745 #if defined(ISC_PLATFORM_HAVEIPV6)
1746 #ifdef IPV6_RECVPKTINFO
1747                 /* 2292bis */
1748                 if ((pf == AF_INET6)
1749                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1750                                    (char *)&on, sizeof(on)) < 0)) {
1751                         isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1752                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1753                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1754                                          "%s: %s", sock->fd,
1755                                          isc_msgcat_get(isc_msgcat,
1756                                                         ISC_MSGSET_GENERAL,
1757                                                         ISC_MSG_FAILED,
1758                                                         "failed"),
1759                                          strbuf);
1760                 }
1761 #else
1762                 /* 2292 */
1763                 if ((pf == AF_INET6)
1764                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1765                                    (char *)&on, sizeof(on)) < 0)) {
1766                         isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1767                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1768                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1769                                          sock->fd,
1770                                          isc_msgcat_get(isc_msgcat,
1771                                                         ISC_MSGSET_GENERAL,
1772                                                         ISC_MSG_FAILED,
1773                                                         "failed"),
1774                                          strbuf);
1775                 }
1776 #endif /* IPV6_RECVPKTINFO */
1777 #ifdef IPV6_USE_MIN_MTU /*2292bis, not too common yet*/
1778                 /* use minimum MTU */
1779                 if (pf == AF_INET6) {
1780                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1781                                          IPV6_USE_MIN_MTU,
1782                                          (char *)&on, sizeof(on));
1783                 }
1784 #endif
1785 #endif /* ISC_PLATFORM_HAVEIPV6 */
1786 #endif /* defined(USE_CMSG) */
1787
1788 #if defined(SO_RCVBUF)
1789                optlen = sizeof(size);
1790                if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1791                               (char *)&size, &optlen) >= 0 &&
1792                     size < RCVBUFSIZE) {
1793                        size = RCVBUFSIZE;
1794                        (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1795                                         (char *)&size, sizeof(size));
1796                }
1797 #endif
1798
1799         }
1800 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1801
1802         _set_state(sock, SOCK_OPEN);
1803         sock->references = 1;
1804         *socketp = sock;
1805
1806         iocompletionport_update(sock);
1807
1808         /*
1809          * Note we don't have to lock the socket like we normally would because
1810          * there are no external references to it yet.
1811          */
1812         LOCK(&manager->lock);
1813         ISC_LIST_APPEND(manager->socklist, sock, link);
1814         InterlockedIncrement(&manager->totalSockets);
1815         UNLOCK(&manager->lock);
1816
1817         socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
1818                    ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
1819                    "created %u type %u", sock->fd, type);
1820
1821         return (ISC_R_SUCCESS);
1822 }
1823
1824 isc_result_t
1825 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1826                    isc_socket_t **socketp)
1827 {
1828         return (socket_create(manager, pf, type, socketp, NULL));
1829 }
1830
1831 isc_result_t
1832 isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1833         REQUIRE(VALID_SOCKET(sock));
1834         REQUIRE(socketp != NULL && *socketp == NULL);
1835
1836 #if 1
1837         return (ISC_R_NOTIMPLEMENTED);
1838 #else
1839         return (socket_create(sock->manager, sock->pf, sock->type,
1840                               socketp, sock));
1841 #endif
1842 }
1843
1844 isc_result_t
1845 isc_socket_open(isc_socket_t *sock) {
1846         REQUIRE(VALID_SOCKET(sock));
1847         REQUIRE(sock->type != isc_sockettype_fdwatch);
1848
1849         return (ISC_R_NOTIMPLEMENTED);
1850 }
1851
1852 /*
1853  * Attach to a socket.  Caller must explicitly detach when it is done.
1854  */
1855 void
1856 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1857         REQUIRE(VALID_SOCKET(sock));
1858         REQUIRE(socketp != NULL && *socketp == NULL);
1859
1860         LOCK(&sock->lock);
1861         CONSISTENT(sock);
1862         sock->references++;
1863         UNLOCK(&sock->lock);
1864
1865         *socketp = sock;
1866 }
1867
1868 /*
1869  * Dereference a socket.  If this is the last reference to it, clean things
1870  * up by destroying the socket.
1871  */
1872 void
1873 isc__socket_detach(isc_socket_t **socketp) {
1874         isc_socket_t *sock;
1875         isc_boolean_t kill_socket = ISC_FALSE;
1876
1877         REQUIRE(socketp != NULL);
1878         sock = *socketp;
1879         REQUIRE(VALID_SOCKET(sock));
1880         REQUIRE(sock->type != isc_sockettype_fdwatch);
1881
1882         LOCK(&sock->lock);
1883         CONSISTENT(sock);
1884         REQUIRE(sock->references > 0);
1885         sock->references--;
1886
1887         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1888                 "detach_socket %d %d %d",
1889                 sock->pending_recv, sock->pending_send,
1890                 sock->references);
1891
1892         if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1893                 closesocket(sock->fd);
1894                 sock->fd = INVALID_SOCKET;
1895                 _set_state(sock, SOCK_CLOSED);
1896         }
1897
1898         maybe_free_socket(&sock, __LINE__);
1899
1900         *socketp = NULL;
1901 }
1902
1903 isc_result_t
1904 isc_socket_close(isc_socket_t *sock) {
1905         REQUIRE(VALID_SOCKET(sock));
1906         REQUIRE(sock->type != isc_sockettype_fdwatch);
1907
1908         return (ISC_R_NOTIMPLEMENTED);
1909 }
1910
1911 /*
1912  * Dequeue an item off the given socket's read queue, set the result code
1913  * in the done event to the one provided, and send it to the task it was
1914  * destined for.
1915  *
1916  * If the event to be sent is on a list, remove it before sending.  If
1917  * asked to, send and detach from the task as well.
1918  *
1919  * Caller must have the socket locked if the event is attached to the socket.
1920  */
1921 static void
1922 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1923         isc_task_t *task;
1924
1925         task = (*dev)->ev_sender;
1926         (*dev)->ev_sender = sock;
1927
1928         if (ISC_LINK_LINKED(*dev, ev_link))
1929                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1930
1931         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1932             == ISC_SOCKEVENTATTR_ATTACHED)
1933                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1934         else
1935                 isc_task_send(task, (isc_event_t **)dev);
1936
1937         CONSISTENT(sock);
1938 }
1939
1940 /*
1941  * See comments for send_recvdone_event() above.
1942  */
1943 static void
1944 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1945         isc_task_t *task;
1946
1947         INSIST(dev != NULL && *dev != NULL);
1948
1949         task = (*dev)->ev_sender;
1950         (*dev)->ev_sender = sock;
1951
1952         if (ISC_LINK_LINKED(*dev, ev_link))
1953                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1954
1955         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1956             == ISC_SOCKEVENTATTR_ATTACHED)
1957                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1958         else
1959                 isc_task_send(task, (isc_event_t **)dev);
1960
1961         CONSISTENT(sock);
1962 }
1963
1964 /*
1965  * See comments for send_recvdone_event() above.
1966  */
1967 static void
1968 send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1969         isc_task_t *task;
1970
1971         INSIST(adev != NULL && *adev != NULL);
1972
1973         task = (*adev)->ev_sender;
1974         (*adev)->ev_sender = sock;
1975
1976         if (ISC_LINK_LINKED(*adev, ev_link))
1977                 ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1978
1979         isc_task_sendanddetach(&task, (isc_event_t **)adev);
1980
1981         CONSISTENT(sock);
1982 }
1983
1984 /*
1985  * See comments for send_recvdone_event() above.
1986  */
1987 static void
1988 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1989         isc_task_t *task;
1990
1991         INSIST(cdev != NULL && *cdev != NULL);
1992
1993         task = (*cdev)->ev_sender;
1994         (*cdev)->ev_sender = sock;
1995
1996         sock->connect_ev = NULL;
1997
1998         isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1999
2000         CONSISTENT(sock);
2001 }
2002
2003 /*
2004  * On entry to this function, the event delivered is the internal
2005  * readable event, and the first item on the accept_list should be
2006  * the done event we want to send.  If the list is empty, this is a no-op,
2007  * so just close the new connection, unlock, and return.
2008  *
2009  * Note the socket is locked before entering here
2010  */
2011 static void
2012 internal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2013         isc_socket_newconnev_t *adev;
2014         isc_result_t result = ISC_R_SUCCESS;
2015         isc_socket_t *nsock;
2016         struct sockaddr *localaddr;
2017         int localaddr_len = sizeof(*localaddr);
2018         struct sockaddr *remoteaddr;
2019         int remoteaddr_len = sizeof(*remoteaddr);
2020
2021         INSIST(VALID_SOCKET(sock));
2022         LOCK(&sock->lock);
2023         CONSISTENT(sock);
2024
2025         socket_log(__LINE__, sock, NULL, TRACE,
2026                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2027                    "internal_accept called");
2028
2029         INSIST(sock->listener);
2030
2031         INSIST(sock->pending_iocp > 0);
2032         sock->pending_iocp--;
2033         INSIST(sock->pending_accept > 0);
2034         sock->pending_accept--;
2035
2036         adev = lpo->adev;
2037
2038         /*
2039          * If the event is no longer in the list we can just return.
2040          */
2041         if (!acceptdone_is_active(sock, adev))
2042                 goto done;
2043
2044         nsock = adev->newsocket;
2045
2046         /*
2047          * Pull off the done event.
2048          */
2049         ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2050
2051         /*
2052          * Extract the addresses from the socket, copy them into the structure,
2053          * and return the new socket.
2054          */
2055         ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2056                 sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2057                 (LPSOCKADDR *)&localaddr, &localaddr_len,
2058                 (LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2059         memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2060         adev->address.length = remoteaddr_len;
2061         nsock->address = adev->address;
2062         nsock->pf = adev->address.type.sa.sa_family;
2063
2064         socket_log(__LINE__, nsock, &nsock->address, TRACE,
2065                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2066                    "internal_accept parent %p", sock);
2067
2068         result = make_nonblock(adev->newsocket->fd);
2069         INSIST(result == ISC_R_SUCCESS);
2070
2071         INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2072                           (char *)&sock->fd, sizeof(sock->fd)) == 0);
2073
2074         /*
2075          * Hook it up into the manager.
2076          */
2077         nsock->bound = 1;
2078         nsock->connected = 1;
2079         _set_state(nsock, SOCK_OPEN);
2080
2081         LOCK(&nsock->manager->lock);
2082         ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2083         InterlockedIncrement(&nsock->manager->totalSockets);
2084         UNLOCK(&nsock->manager->lock);
2085
2086         socket_log(__LINE__, sock, &nsock->address, CREATION,
2087                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2088                    "accepted_connection new_socket %p fd %d",
2089                    nsock, nsock->fd);
2090
2091         adev->result = result;
2092         send_acceptdone_event(sock, &adev);
2093
2094 done:
2095         CONSISTENT(sock);
2096         UNLOCK(&sock->lock);
2097
2098         HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2099         lpo->acceptbuffer = NULL;
2100 }
2101
2102 /*
2103  * Called when a socket with a pending connect() finishes.
2104  * Note that the socket is locked before entering.
2105  */
2106 static void
2107 internal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2108         isc_socket_connev_t *cdev;
2109         char strbuf[ISC_STRERRORSIZE];
2110
2111         INSIST(VALID_SOCKET(sock));
2112
2113         LOCK(&sock->lock);
2114
2115         INSIST(sock->pending_iocp > 0);
2116         sock->pending_iocp--;
2117         INSIST(sock->pending_connect == 1);
2118         sock->pending_connect = 0;
2119
2120         /*
2121          * Has this event been canceled?
2122          */
2123         cdev = lpo->cdev;
2124         if (!connectdone_is_active(sock, cdev)) {
2125                 sock->pending_connect = 0;
2126                 if (sock->fd != INVALID_SOCKET) {
2127                         closesocket(sock->fd);
2128                         sock->fd = INVALID_SOCKET;
2129                         _set_state(sock, SOCK_CLOSED);
2130                 }
2131                 CONSISTENT(sock);
2132                 UNLOCK(&sock->lock);
2133                 return;
2134         }
2135
2136         /*
2137          * Check possible Windows network event error status here.
2138          */
2139         if (connect_errno != 0) {
2140                 /*
2141                  * If the error is SOFT, just try again on this
2142                  * fd and pretend nothing strange happened.
2143                  */
2144                 if (SOFT_ERROR(connect_errno) ||
2145                     connect_errno == WSAEINPROGRESS) {
2146                         sock->pending_connect = 1;
2147                         CONSISTENT(sock);
2148                         UNLOCK(&sock->lock);
2149                         return;
2150                 }
2151
2152                 /*
2153                  * Translate other errors into ISC_R_* flavors.
2154                  */
2155                 switch (connect_errno) {
2156 #define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2157                         ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2158                         ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2159                         ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2160                         ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2161                         ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2162                         ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2163                         ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2164                         ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2165                         ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2166                         ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2167                         ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2168                         ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2169 #undef ERROR_MATCH
2170                 default:
2171                         cdev->result = ISC_R_UNEXPECTED;
2172                         isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2173                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2174                                          "internal_connect: connect() %s",
2175                                          strbuf);
2176                 }
2177         } else {
2178                 INSIST(setsockopt(sock->fd, SOL_SOCKET,
2179                                   SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2180                 cdev->result = ISC_R_SUCCESS;
2181                 sock->connected = 1;
2182                 socket_log(__LINE__, sock, &sock->address, IOEVENT,
2183                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2184                            "internal_connect: success");
2185         }
2186
2187         send_connectdone_event(sock, &cdev);
2188
2189         UNLOCK(&sock->lock);
2190 }
2191
2192 /*
2193  * Loop through the socket, returning ISC_R_EOF for each done event pending.
2194  */
2195 static void
2196 send_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2197         isc_socketevent_t *dev;
2198
2199         while (!ISC_LIST_EMPTY(sock->recv_list)) {
2200                 dev = ISC_LIST_HEAD(sock->recv_list);
2201                 dev->result = result;
2202                 send_recvdone_event(sock, &dev);
2203         }
2204 }
2205
2206 /*
2207  * Take the data we received in our private buffer, and if any recv() calls on
2208  * our list are satisfied, send the corresponding done event.
2209  *
2210  * If we need more data (there are still items on the recv_list after we consume all
2211  * our data) then arrange for another system recv() call to fill our buffers.
2212  */
2213 static void
2214 internal_recv(isc_socket_t *sock, int nbytes)
2215 {
2216         INSIST(VALID_SOCKET(sock));
2217
2218         LOCK(&sock->lock);
2219         CONSISTENT(sock);
2220
2221         socket_log(__LINE__, sock, NULL, IOEVENT,
2222                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2223                    "internal_recv: %d bytes received", nbytes);
2224
2225         /*
2226          * If we got here, the I/O operation succeeded.  However, we might still have removed this
2227          * event from our notification list (or never placed it on it due to immediate completion.)
2228          * Handle the reference counting here, and handle the cancellation event just after.
2229          */
2230         INSIST(sock->pending_iocp > 0);
2231         sock->pending_iocp--;
2232         INSIST(sock->pending_recv > 0);
2233         sock->pending_recv--;
2234
2235         /*
2236          * The only way we could have gotten here is that our I/O has successfully completed.
2237          * Update our pointers, and move on.  The only odd case here is that we might not
2238          * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2239          * this is the case, we will re-issue the recv() call for what we need.
2240          *
2241          * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2242          * has closed.
2243          */
2244         if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2245                 send_recvdone_abort(sock, ISC_R_EOF);
2246                 maybe_free_socket(&sock, __LINE__);
2247                 return;
2248         }
2249         sock->recvbuf.remaining = nbytes;
2250         sock->recvbuf.consume_position = sock->recvbuf.base;
2251         completeio_recv(sock);
2252
2253         /*
2254          * If there are more receivers waiting for data, queue another receive
2255          * here.
2256          */
2257         queue_receive_request(sock);
2258
2259         /*
2260          * Unlock and/or destroy if we are the last thing this socket has left to do.
2261          */
2262         maybe_free_socket(&sock, __LINE__);
2263 }
2264
2265 static void
2266 internal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2267               struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2268 {
2269         buflist_t *buffer;
2270
2271         /*
2272          * Find out what socket this is and lock it.
2273          */
2274         INSIST(VALID_SOCKET(sock));
2275
2276         LOCK(&sock->lock);
2277         CONSISTENT(sock);
2278
2279         socket_log(__LINE__, sock, NULL, IOEVENT,
2280                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2281                    "internal_send: task got socket event %p", dev);
2282
2283         buffer = ISC_LIST_HEAD(lpo->bufferlist);
2284         while (buffer != NULL) {
2285                 ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2286
2287                 socket_log(__LINE__, sock, NULL, TRACE,
2288                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2289                    "free_buffer %p %p", buffer, buffer->buf);
2290
2291                 HeapFree(hHeapHandle, 0, buffer->buf);
2292                 HeapFree(hHeapHandle, 0, buffer);
2293                 buffer = ISC_LIST_HEAD(lpo->bufferlist);
2294         }
2295
2296         INSIST(sock->pending_iocp > 0);
2297         sock->pending_iocp--;
2298         INSIST(sock->pending_send > 0);
2299         sock->pending_send--;
2300
2301         /* If the event is no longer in the list we can just return */
2302         if (!senddone_is_active(sock, dev))
2303                 goto done;
2304
2305         /*
2306          * Set the error code and send things on its way.
2307          */
2308         switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2309         case DOIO_SOFT:
2310                 break;
2311         case DOIO_HARD:
2312         case DOIO_SUCCESS:
2313                 send_senddone_event(sock, &dev);
2314                 break;
2315         }
2316
2317  done:
2318         maybe_free_socket(&sock, __LINE__);
2319 }
2320
/*
 * These return whether the done event passed in is on the list (or, for
 * connect, is the one we're waiting for).  Using these ensures we will not
 * double-send an event.
 */
2326 static isc_boolean_t
2327 senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2328 {
2329         isc_socketevent_t *ldev;
2330
2331         ldev = ISC_LIST_HEAD(sock->send_list);
2332         while (ldev != NULL && ldev != dev)
2333                 ldev = ISC_LIST_NEXT(ldev, ev_link);
2334
2335         return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2336 }
2337
2338 static isc_boolean_t
2339 acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2340 {
2341         isc_socket_newconnev_t *ldev;
2342
2343         ldev = ISC_LIST_HEAD(sock->accept_list);
2344         while (ldev != NULL && ldev != dev)
2345                 ldev = ISC_LIST_NEXT(ldev, ev_link);
2346
2347         return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2348 }
2349
2350 static isc_boolean_t
2351 connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2352 {
2353         return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2354 }
2355
2356 //
2357 // The Windows network stack seems to have two very distinct paths depending
2358 // on what is installed.  Specifically, if something is looking at network
2359 // connections (like an anti-virus or anti-malware application, such as
2360 // McAfee products) Windows may return additional error conditions which
2361 // were not previously returned.
2362 //
2363 // One specific one is when a TCP SYN scan is used.  In this situation,
2364 // Windows responds with the SYN-ACK, but the scanner never responds with
2365 // the 3rd packet, the ACK.  Windows consiers this a partially open connection.
2366 // Most Unix networking stacks, and Windows without McAfee installed, will
2367 // not return this to the caller.  However, with this product installed,
2368 // Windows returns this as a failed status on the Accept() call.  Here, we
2369 // will just re-issue the ISCAcceptEx() call as if nothing had happened.
2370 //
2371 // This code should only be called when the listening socket has received
2372 // such an error.  Additionally, the "parent" socket must be locked.
2373 // Additionally, the lpo argument is re-used here, and must not be freed
2374 // by the caller.
2375 //
2376 static isc_result_t
2377 restart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2378 {
2379         isc_socket_t *nsock = lpo->adev->newsocket;
2380         SOCKET new_fd;
2381
2382         /*
2383          * AcceptEx() requires we pass in a socket.  Note that we carefully
2384          * do not close the previous socket in case of an error message returned by
2385          * our new socket() call.  If we return an error here, our caller will
2386          * clean up.
2387          */
2388         new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2389         if (nsock->fd == INVALID_SOCKET) {
2390                 return (ISC_R_FAILURE); // parent will ask windows for error message
2391         }
2392         closesocket(nsock->fd);
2393         nsock->fd = new_fd;
2394
2395         memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2396
2397         ISCAcceptEx(parent->fd,
2398                     nsock->fd,                          /* Accepted Socket */
2399                     lpo->acceptbuffer,                  /* Buffer for initial Recv */
2400                     0,                                  /* Length of Buffer */
2401                     sizeof(SOCKADDR_STORAGE) + 16,      /* Local address length + 16 */
2402                     sizeof(SOCKADDR_STORAGE) + 16,      /* Remote address lengh + 16 */
2403                     (LPDWORD)&lpo->received_bytes,      /* Bytes Recved */
2404                     (LPOVERLAPPED)lpo                   /* Overlapped structure */
2405                     );
2406
2407         InterlockedDecrement(&nsock->manager->iocp_total);
2408         iocompletionport_update(nsock);
2409
2410         return (ISC_R_SUCCESS);
2411 }
2412
2413 /*
2414  * This is the I/O Completion Port Worker Function. It loops forever
2415  * waiting for I/O to complete and then forwards them for further
2416  * processing. There are a number of these in separate threads.
2417  */
2418 static isc_threadresult_t WINAPI
2419 SocketIoThread(LPVOID ThreadContext) {
2420         isc_socketmgr_t *manager = ThreadContext;
2421         BOOL bSuccess = FALSE;
2422         DWORD nbytes;
2423         IoCompletionInfo *lpo = NULL;
2424         isc_socket_t *sock = NULL;
2425         int request;
2426         struct msghdr *messagehdr = NULL;
2427         int errval;
2428         char strbuf[ISC_STRERRORSIZE];
2429         int errstatus;
2430
2431         REQUIRE(VALID_MANAGER(manager));
2432
2433         /*
2434          * Set the thread priority high enough so I/O will
2435          * preempt normal recv packet processing, but not
2436          * higher than the timer sync thread.
2437          */
2438         if (!SetThreadPriority(GetCurrentThread(),
2439                                THREAD_PRIORITY_ABOVE_NORMAL)) {
2440                 errval = GetLastError();
2441                 isc__strerror(errval, strbuf, sizeof(strbuf));
2442                 FATAL_ERROR(__FILE__, __LINE__,
2443                                 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2444                                 ISC_MSG_FAILED,
2445                                 "Can't set thread priority: %s"),
2446                                 strbuf);
2447         }
2448
2449         /*
2450          * Loop forever waiting on I/O Completions and then processing them
2451          */
2452         while (TRUE) {
2453                 wait_again:
2454                 bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2455                                                      &nbytes, (LPDWORD)&sock,
2456                                                      (LPWSAOVERLAPPED *)&lpo,
2457                                                      INFINITE);
2458                 if (lpo == NULL) /* Received request to exit */
2459                         break;
2460
2461                 REQUIRE(VALID_SOCKET(sock));
2462
2463                 request = lpo->request_type;
2464
2465                 errstatus = 0;
2466                 if (!bSuccess) {
2467                         isc_result_t isc_result;
2468
2469                         /*
2470                          * Did the I/O operation complete?
2471                          */
2472                         errstatus = GetLastError();
2473                         isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2474
2475                         LOCK(&sock->lock);
2476                         CONSISTENT(sock);
2477                         switch (request) {
2478                         case SOCKET_RECV:
2479                                 INSIST(sock->pending_iocp > 0);
2480                                 sock->pending_iocp--;
2481                                 INSIST(sock->pending_recv > 0);
2482                                 sock->pending_recv--;
2483                                 if (!sock->connected &&
2484                                     ((errstatus == ERROR_HOST_UNREACHABLE) ||
2485                                      (errstatus == WSAENETRESET) ||
2486                                      (errstatus == WSAECONNRESET))) {
2487                                         /* ignore soft errors */
2488                                         queue_receive_request(sock);
2489                                         break;
2490                                 }
2491                                 send_recvdone_abort(sock, isc_result);
2492                                 if (isc_result == ISC_R_UNEXPECTED) {
2493                                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2494                                                 "SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2495                                                 errstatus, isc_result);
2496                                 }
2497                                 break;
2498
2499                         case SOCKET_SEND:
2500                                 INSIST(sock->pending_iocp > 0);
2501                                 sock->pending_iocp--;
2502                                 INSIST(sock->pending_send > 0);
2503                                 sock->pending_send--;
2504                                 if (senddone_is_active(sock, lpo->dev)) {
2505                                         lpo->dev->result = isc_result;
2506                                         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2507                                                 "canceled_send");
2508                                         send_senddone_event(sock, &lpo->dev);
2509                                 }
2510                                 break;
2511
2512                         case SOCKET_ACCEPT:
2513                                 INSIST(sock->pending_iocp > 0);
2514                                 INSIST(sock->pending_accept > 0);
2515
2516                                 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2517                                         "Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2518
2519                                 if (acceptdone_is_active(sock, lpo->adev)) {
2520                                         if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2521                                                 UNLOCK(&sock->lock);
2522                                                 goto wait_again;
2523                                         } else {
2524                                                 errstatus = GetLastError();
2525                                                 isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2526                                                 socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2527                                                         "restart_accept() failed: errstatus=%d isc_result=%d",
2528                                                         errstatus, isc_result);
2529                                         }
2530                                 }
2531
2532                                 sock->pending_iocp--;
2533                                 sock->pending_accept--;
2534                                 if (acceptdone_is_active(sock, lpo->adev)) {
2535                                         closesocket(lpo->adev->newsocket->fd);
2536                                         lpo->adev->newsocket->fd = INVALID_SOCKET;
2537                                         lpo->adev->newsocket->references--;
2538                                         free_socket(&lpo->adev->newsocket, __LINE__);
2539                                         lpo->adev->result = isc_result;
2540                                         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2541                                                 "canceled_accept");
2542                                         send_acceptdone_event(sock, &lpo->adev);
2543                                 }
2544                                 break;
2545
2546                         case SOCKET_CONNECT:
2547                                 INSIST(sock->pending_iocp > 0);
2548                                 sock->pending_iocp--;
2549                                 INSIST(sock->pending_connect == 1);
2550                                 sock->pending_connect = 0;
2551                                 if (connectdone_is_active(sock, lpo->cdev)) {
2552                                         lpo->cdev->result = isc_result;
2553                                         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2554                                                 "canceled_connect");
2555                                         send_connectdone_event(sock, &lpo->cdev);
2556                                 }
2557                                 break;
2558                         }
2559                         maybe_free_socket(&sock, __LINE__);
2560
2561                         if (lpo != NULL)
2562                                 HeapFree(hHeapHandle, 0, lpo);
2563                         continue;
2564                 }
2565
2566                 messagehdr = &lpo->messagehdr;
2567
2568                 switch (request) {
2569                 case SOCKET_RECV:
2570                         internal_recv(sock, nbytes);
2571                         break;
2572                 case SOCKET_SEND:
2573                         internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2574                         break;
2575                 case SOCKET_ACCEPT:
2576                         internal_accept(sock, lpo, errstatus);
2577                         break;
2578                 case SOCKET_CONNECT:
2579                         internal_connect(sock, lpo, errstatus);
2580                         break;
2581                 }
2582
2583                 if (lpo != NULL)
2584                         HeapFree(hHeapHandle, 0, lpo);
2585         }
2586
2587         /*
2588          * Exit Completion Port Thread
2589          */
2590         manager_log(manager, TRACE,
2591                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2592                                    ISC_MSG_EXITING, "SocketIoThread exiting"));
2593         return ((isc_threadresult_t)0);
2594 }
2595
2596 /*
2597  * Create a new socket manager.
2598  */
2599 isc_result_t
2600 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2601         return (isc_socketmgr_create2(mctx, managerp, 0));
2602 }
2603
2604 isc_result_t
2605 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2606                        unsigned int maxsocks)
2607 {
2608         isc_socketmgr_t *manager;
2609         isc_result_t result;
2610
2611         REQUIRE(managerp != NULL && *managerp == NULL);
2612
2613         if (maxsocks != 0)
2614                 return (ISC_R_NOTIMPLEMENTED);
2615
2616         manager = isc_mem_get(mctx, sizeof(*manager));
2617         if (manager == NULL)
2618                 return (ISC_R_NOMEMORY);
2619
2620         InitSockets();
2621
2622         manager->magic = SOCKET_MANAGER_MAGIC;
2623         manager->mctx = NULL;
2624         manager->stats = NULL;
2625         ISC_LIST_INIT(manager->socklist);
2626         result = isc_mutex_init(&manager->lock);
2627         if (result != ISC_R_SUCCESS) {
2628                 isc_mem_put(mctx, manager, sizeof(*manager));
2629                 return (result);
2630         }
2631         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2632                 DESTROYLOCK(&manager->lock);
2633                 isc_mem_put(mctx, manager, sizeof(*manager));
2634                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2635                                  "isc_condition_init() %s",
2636                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2637                                                 ISC_MSG_FAILED, "failed"));
2638                 return (ISC_R_UNEXPECTED);
2639         }
2640
2641         isc_mem_attach(mctx, &manager->mctx);
2642
2643         iocompletionport_init(manager); /* Create the Completion Ports */
2644
2645         manager->bShutdown = ISC_FALSE;
2646         manager->totalSockets = 0;
2647         manager->iocp_total = 0;
2648
2649         *managerp = manager;
2650
2651         return (ISC_R_SUCCESS);
2652 }
2653
2654 isc_result_t
2655 isc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2656         REQUIRE(VALID_MANAGER(manager));
2657         REQUIRE(nsockp != NULL);
2658
2659         return (ISC_R_NOTIMPLEMENTED);
2660 }
2661
2662 void
2663 isc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2664         REQUIRE(VALID_MANAGER(manager));
2665         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2666         REQUIRE(manager->stats == NULL);
2667         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2668
2669         isc_stats_attach(stats, &manager->stats);
2670 }
2671
2672 void
2673 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2674         isc_socketmgr_t *manager;
2675         int i;
2676         isc_mem_t *mctx;
2677
2678         /*
2679          * Destroy a socket manager.
2680          */
2681
2682         REQUIRE(managerp != NULL);
2683         manager = *managerp;
2684         REQUIRE(VALID_MANAGER(manager));
2685
2686         LOCK(&manager->lock);
2687
2688         /*
2689          * Wait for all sockets to be destroyed.
2690          */
2691         while (!ISC_LIST_EMPTY(manager->socklist)) {
2692                 manager_log(manager, CREATION,
2693                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2694                                            ISC_MSG_SOCKETSREMAIN,
2695                                            "sockets exist"));
2696                 WAIT(&manager->shutdown_ok, &manager->lock);
2697         }
2698
2699         UNLOCK(&manager->lock);
2700
2701         /*
2702          * Here, we need to had some wait code for the completion port
2703          * thread.
2704          */
2705         signal_iocompletionport_exit(manager);
2706         manager->bShutdown = ISC_TRUE;
2707
2708         /*
2709          * Wait for threads to exit.
2710          */
2711         for (i = 0; i < manager->maxIOCPThreads; i++) {
2712                 if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2713                         NULL) != ISC_R_SUCCESS)
2714                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2715                                  "isc_thread_join() for Completion Port %s",
2716                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2717                                                 ISC_MSG_FAILED, "failed"));
2718         }
2719         /*
2720          * Clean up.
2721          */
2722
2723         CloseHandle(manager->hIoCompletionPort);
2724
2725         (void)isc_condition_destroy(&manager->shutdown_ok);
2726
2727         DESTROYLOCK(&manager->lock);
2728         if (manager->stats != NULL)
2729                 isc_stats_detach(&manager->stats);
2730         manager->magic = 0;
2731         mctx= manager->mctx;
2732         isc_mem_put(mctx, manager, sizeof(*manager));
2733
2734         isc_mem_detach(&mctx);
2735
2736         *managerp = NULL;
2737 }
2738
2739 static void
2740 queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2741 {
2742         isc_task_t *ntask = NULL;
2743
2744         isc_task_attach(task, &ntask);
2745         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2746
2747         /*
2748          * Enqueue the request.
2749          */
2750         INSIST(!ISC_LINK_LINKED(dev, ev_link));
2751         ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2752
2753         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2754                    "queue_receive_event: event %p -> task %p",
2755                    dev, ntask);
2756 }
2757
2758 /*
2759  * Check the pending receive queue, and if we have data pending, give it to this
2760  * caller.  If we have none, queue an I/O request.  If this caller is not the first
2761  * on the list, then we will just queue this event and return.
2762  *
2763  * Caller must have the socket locked.
2764  */
2765 static isc_result_t
2766 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2767             unsigned int flags)
2768 {
2769         int cc = 0;
2770         isc_task_t *ntask = NULL;
2771         isc_result_t result = ISC_R_SUCCESS;
2772         int recv_errno = 0;
2773
2774         dev->ev_sender = task;
2775
2776         if (sock->fd == INVALID_SOCKET)
2777                 return (ISC_R_EOF);
2778
2779         /*
2780          * Queue our event on the list of things to do.  Call our function to
2781          * attempt to fill buffers as much as possible, and return done events.
2782          * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2783          * here and tell our caller that we could not satisfy it immediately.
2784          */
2785         queue_receive_event(sock, task, dev);
2786         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2787                 result = ISC_R_INPROGRESS;
2788
2789         completeio_recv(sock);
2790
2791         /*
2792          * If there are more receivers waiting for data, queue another receive
2793          * here.  If the
2794          */
2795         queue_receive_request(sock);
2796
2797         return (result);
2798 }
2799
2800 isc_result_t
2801 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2802                  unsigned int minimum, isc_task_t *task,
2803                  isc_taskaction_t action, const void *arg)
2804 {
2805         isc_socketevent_t *dev;
2806         isc_socketmgr_t *manager;
2807         unsigned int iocount;
2808         isc_buffer_t *buffer;
2809         isc_result_t ret;
2810
2811         REQUIRE(VALID_SOCKET(sock));
2812         LOCK(&sock->lock);
2813         CONSISTENT(sock);
2814
2815         /*
2816          * Make sure that the socket is not closed.  XXXMLG change error here?
2817          */
2818         if (sock->fd == INVALID_SOCKET) {
2819                 UNLOCK(&sock->lock);
2820                 return (ISC_R_CONNREFUSED);
2821         }
2822
2823         REQUIRE(buflist != NULL);
2824         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2825         REQUIRE(task != NULL);
2826         REQUIRE(action != NULL);
2827
2828         manager = sock->manager;
2829         REQUIRE(VALID_MANAGER(manager));
2830
2831         iocount = isc_bufferlist_availablecount(buflist);
2832         REQUIRE(iocount > 0);
2833
2834         INSIST(sock->bound);
2835
2836         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2837         if (dev == NULL) {
2838                 UNLOCK(&sock->lock);
2839                 return (ISC_R_NOMEMORY);
2840         }
2841
2842         /*
2843          * UDP sockets are always partial read
2844          */
2845         if (sock->type == isc_sockettype_udp)
2846                 dev->minimum = 1;
2847         else {
2848                 if (minimum == 0)
2849                         dev->minimum = iocount;
2850                 else
2851                         dev->minimum = minimum;
2852         }
2853
2854         /*
2855          * Move each buffer from the passed in list to our internal one.
2856          */
2857         buffer = ISC_LIST_HEAD(*buflist);
2858         while (buffer != NULL) {
2859                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2860                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2861                 buffer = ISC_LIST_HEAD(*buflist);
2862         }
2863
2864         ret = socket_recv(sock, dev, task, 0);
2865
2866         UNLOCK(&sock->lock);
2867         return (ret);
2868 }
2869
2870 isc_result_t
2871 isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
2872                  unsigned int minimum, isc_task_t *task,
2873                  isc_taskaction_t action, const void *arg)
2874 {
2875         isc_socketevent_t *dev;
2876         isc_socketmgr_t *manager;
2877         isc_result_t ret;
2878
2879         REQUIRE(VALID_SOCKET(sock));
2880         LOCK(&sock->lock);
2881         CONSISTENT(sock);
2882
2883         /*
2884          * make sure that the socket's not closed
2885          */
2886         if (sock->fd == INVALID_SOCKET) {
2887                 UNLOCK(&sock->lock);
2888                 return (ISC_R_CONNREFUSED);
2889         }
2890         REQUIRE(action != NULL);
2891
2892         manager = sock->manager;
2893         REQUIRE(VALID_MANAGER(manager));
2894
2895         INSIST(sock->bound);
2896
2897         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2898         if (dev == NULL) {
2899                 UNLOCK(&sock->lock);
2900                 return (ISC_R_NOMEMORY);
2901         }
2902
2903         ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2904         UNLOCK(&sock->lock);
2905         return (ret);
2906 }
2907
2908 isc_result_t
2909 isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2910                   unsigned int minimum, isc_task_t *task,
2911                   isc_socketevent_t *event, unsigned int flags)
2912 {
2913         isc_result_t ret;
2914
2915         REQUIRE(VALID_SOCKET(sock));
2916         LOCK(&sock->lock);
2917         CONSISTENT(sock);
2918
2919         event->result = ISC_R_UNEXPECTED;
2920         event->ev_sender = sock;
2921         /*
2922          * make sure that the socket's not closed
2923          */
2924         if (sock->fd == INVALID_SOCKET) {
2925                 UNLOCK(&sock->lock);
2926                 return (ISC_R_CONNREFUSED);
2927         }
2928
2929         ISC_LIST_INIT(event->bufferlist);
2930         event->region = *region;
2931         event->n = 0;
2932         event->offset = 0;
2933         event->attributes = 0;
2934
2935         /*
2936          * UDP sockets are always partial read.
2937          */
2938         if (sock->type == isc_sockettype_udp)
2939                 event->minimum = 1;
2940         else {
2941                 if (minimum == 0)
2942                         event->minimum = region->length;
2943                 else
2944                         event->minimum = minimum;
2945         }
2946
2947         ret = socket_recv(sock, event, task, flags);
2948         UNLOCK(&sock->lock);
2949         return (ret);
2950 }
2951
2952 /*
2953  * Caller must have the socket locked.
2954  */
2955 static isc_result_t
2956 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2957             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2958             unsigned int flags)
2959 {
2960         int io_state;
2961         int send_errno = 0;
2962         int cc = 0;
2963         isc_task_t *ntask = NULL;
2964         isc_result_t result = ISC_R_SUCCESS;
2965
2966         dev->ev_sender = task;
2967
2968         set_dev_address(address, sock, dev);
2969         if (pktinfo != NULL) {
2970                 socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2971                            ISC_MSG_PKTINFOPROVIDED,
2972                            "pktinfo structure provided, ifindex %u (set to 0)",
2973                            pktinfo->ipi6_ifindex);
2974
2975                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2976                 dev->pktinfo = *pktinfo;
2977                 /*
2978                  * Set the pktinfo index to 0 here, to let the kernel decide
2979                  * what interface it should send on.
2980                  */
2981                 dev->pktinfo.ipi6_ifindex = 0;
2982         }
2983
2984         io_state = startio_send(sock, dev, &cc, &send_errno);
2985         switch (io_state) {
2986         case DOIO_PENDING:      /* I/O started. Nothing more to do */
2987         case DOIO_SOFT:
2988                 /*
2989                  * We couldn't send all or part of the request right now, so
2990                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2991                  */
2992                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2993                         isc_task_attach(task, &ntask);
2994                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2995
2996                         /*
2997                          * Enqueue the request.
2998                          */
2999                         INSIST(!ISC_LINK_LINKED(dev, ev_link));
3000                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3001
3002                         socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
3003                                    "socket_send: event %p -> task %p",
3004                                    dev, ntask);
3005
3006                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3007                                 result = ISC_R_INPROGRESS;
3008                         break;
3009                 }
3010
3011         case DOIO_SUCCESS:
3012                 break;
3013         }
3014
3015         return (result);
3016 }
3017
3018 isc_result_t
3019 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
3020                  isc_task_t *task, isc_taskaction_t action, const void *arg)
3021 {
3022         /*
3023          * REQUIRE() checking is performed in isc_socket_sendto().
3024          */
3025         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3026                                   NULL));
3027 }
3028
3029 isc_result_t
3030 isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
3031                    isc_task_t *task, isc_taskaction_t action, const void *arg,
3032                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3033 {
3034         isc_socketevent_t *dev;
3035         isc_socketmgr_t *manager;
3036         isc_result_t ret;
3037
3038         REQUIRE(VALID_SOCKET(sock));
3039         REQUIRE(sock->type != isc_sockettype_fdwatch);
3040
3041         LOCK(&sock->lock);
3042         CONSISTENT(sock);
3043
3044         /*
3045          * make sure that the socket's not closed
3046          */
3047         if (sock->fd == INVALID_SOCKET) {
3048                 UNLOCK(&sock->lock);
3049                 return (ISC_R_CONNREFUSED);
3050         }
3051         REQUIRE(region != NULL);
3052         REQUIRE(task != NULL);
3053         REQUIRE(action != NULL);
3054
3055         manager = sock->manager;
3056         REQUIRE(VALID_MANAGER(manager));
3057
3058         INSIST(sock->bound);
3059
3060         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3061         if (dev == NULL) {
3062                 UNLOCK(&sock->lock);
3063                 return (ISC_R_NOMEMORY);
3064         }
3065         dev->region = *region;
3066
3067         ret = socket_send(sock, dev, task, address, pktinfo, 0);
3068         UNLOCK(&sock->lock);
3069         return (ret);
3070 }
3071
3072 isc_result_t
3073 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3074                   isc_task_t *task, isc_taskaction_t action, const void *arg)
3075 {
3076         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3077                                    NULL));
3078 }
3079
3080 isc_result_t
3081 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3082                     isc_task_t *task, isc_taskaction_t action, const void *arg,
3083                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3084 {
3085         isc_socketevent_t *dev;
3086         isc_socketmgr_t *manager;
3087         unsigned int iocount;
3088         isc_buffer_t *buffer;
3089         isc_result_t ret;
3090
3091         REQUIRE(VALID_SOCKET(sock));
3092
3093         LOCK(&sock->lock);
3094         CONSISTENT(sock);
3095
3096         /*
3097          * make sure that the socket's not closed
3098          */
3099         if (sock->fd == INVALID_SOCKET) {
3100                 UNLOCK(&sock->lock);
3101                 return (ISC_R_CONNREFUSED);
3102         }
3103         REQUIRE(buflist != NULL);
3104         REQUIRE(!ISC_LIST_EMPTY(*buflist));
3105         REQUIRE(task != NULL);
3106         REQUIRE(action != NULL);
3107
3108         manager = sock->manager;
3109         REQUIRE(VALID_MANAGER(manager));
3110
3111         iocount = isc_bufferlist_usedcount(buflist);
3112         REQUIRE(iocount > 0);
3113
3114         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3115         if (dev == NULL) {
3116                 UNLOCK(&sock->lock);
3117                 return (ISC_R_NOMEMORY);
3118         }
3119
3120         /*
3121          * Move each buffer from the passed in list to our internal one.
3122          */
3123         buffer = ISC_LIST_HEAD(*buflist);
3124         while (buffer != NULL) {
3125                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
3126                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3127                 buffer = ISC_LIST_HEAD(*buflist);
3128         }
3129
3130         ret = socket_send(sock, dev, task, address, pktinfo, 0);
3131         UNLOCK(&sock->lock);
3132         return (ret);
3133 }
3134
3135 isc_result_t
3136 isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3137                     isc_task_t *task,
3138                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3139                     isc_socketevent_t *event, unsigned int flags)
3140 {
3141         isc_result_t ret;
3142
3143         REQUIRE(VALID_SOCKET(sock));
3144         LOCK(&sock->lock);
3145         CONSISTENT(sock);
3146
3147         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3148         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3149                 REQUIRE(sock->type == isc_sockettype_udp);
3150         event->ev_sender = sock;
3151         event->result = ISC_R_UNEXPECTED;
3152         /*
3153          * make sure that the socket's not closed
3154          */
3155         if (sock->fd == INVALID_SOCKET) {
3156                 UNLOCK(&sock->lock);
3157                 return (ISC_R_CONNREFUSED);
3158         }
3159         ISC_LIST_INIT(event->bufferlist);
3160         event->region = *region;
3161         event->n = 0;
3162         event->offset = 0;
3163         event->attributes = 0;
3164
3165         ret = socket_send(sock, event, task, address, pktinfo, flags);
3166         UNLOCK(&sock->lock);
3167         return (ret);
3168 }
3169
3170 isc_result_t
3171 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3172                  unsigned int options) {
3173         int bind_errno;
3174         char strbuf[ISC_STRERRORSIZE];
3175         int on = 1;
3176
3177         REQUIRE(VALID_SOCKET(sock));
3178         LOCK(&sock->lock);
3179         CONSISTENT(sock);
3180
3181         /*
3182          * make sure that the socket's not closed
3183          */
3184         if (sock->fd == INVALID_SOCKET) {
3185                 UNLOCK(&sock->lock);
3186                 return (ISC_R_CONNREFUSED);
3187         }
3188
3189         INSIST(!sock->bound);
3190         INSIST(!sock->dupped);
3191
3192         if (sock->pf != sockaddr->type.sa.sa_family) {
3193                 UNLOCK(&sock->lock);
3194                 return (ISC_R_FAMILYMISMATCH);
3195         }
3196         /*
3197          * Only set SO_REUSEADDR when we want a specific port.
3198          */
3199         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3200             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3201             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3202                        sizeof(on)) < 0) {
3203                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3204                                  "setsockopt(%d) %s", sock->fd,
3205                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3206                                                 ISC_MSG_FAILED, "failed"));
3207                 /* Press on... */
3208         }
3209         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3210                 bind_errno = WSAGetLastError();
3211                 UNLOCK(&sock->lock);
3212                 switch (bind_errno) {
3213                 case WSAEACCES:
3214                         return (ISC_R_NOPERM);
3215                 case WSAEADDRNOTAVAIL:
3216                         return (ISC_R_ADDRNOTAVAIL);
3217                 case WSAEADDRINUSE:
3218                         return (ISC_R_ADDRINUSE);
3219                 case WSAEINVAL:
3220                         return (ISC_R_BOUND);
3221                 default:
3222                         isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3223                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3224                                          strbuf);
3225                         return (ISC_R_UNEXPECTED);
3226                 }
3227         }
3228
3229         socket_log(__LINE__, sock, sockaddr, TRACE,
3230                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3231         sock->bound = 1;
3232
3233         UNLOCK(&sock->lock);
3234         return (ISC_R_SUCCESS);
3235 }
3236
3237 isc_result_t
3238 isc__socket_filter(isc_socket_t *sock, const char *filter) {
3239         UNUSED(sock);
3240         UNUSED(filter);
3241
3242         REQUIRE(VALID_SOCKET(sock));
3243         return (ISC_R_NOTIMPLEMENTED);
3244 }
3245
3246 /*
3247  * Set up to listen on a given socket.  We do this by creating an internal
3248  * event that will be dispatched when the socket has read activity.  The
3249  * watcher will send the internal event to the task when there is a new
3250  * connection.
3251  *
3252  * Unlike in read, we don't preallocate a done event here.  Every time there
3253  * is a new connection we'll have to allocate a new one anyway, so we might
3254  * as well keep things simple rather than having to track them.
3255  */
3256 isc_result_t
3257 isc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3258         char strbuf[ISC_STRERRORSIZE];
3259
3260         REQUIRE(VALID_SOCKET(sock));
3261
3262         LOCK(&sock->lock);
3263         CONSISTENT(sock);
3264
3265         /*
3266          * make sure that the socket's not closed
3267          */
3268         if (sock->fd == INVALID_SOCKET) {
3269                 UNLOCK(&sock->lock);
3270                 return (ISC_R_CONNREFUSED);
3271         }
3272
3273         REQUIRE(!sock->listener);
3274         REQUIRE(sock->bound);
3275         REQUIRE(sock->type == isc_sockettype_tcp);
3276
3277         if (backlog == 0)
3278                 backlog = SOMAXCONN;
3279
3280         if (listen(sock->fd, (int)backlog) < 0) {
3281                 UNLOCK(&sock->lock);
3282                 isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3283
3284                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3285
3286                 return (ISC_R_UNEXPECTED);
3287         }
3288
3289         socket_log(__LINE__, sock, NULL, TRACE,
3290                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3291         sock->listener = 1;
3292         _set_state(sock, SOCK_LISTEN);
3293
3294         UNLOCK(&sock->lock);
3295         return (ISC_R_SUCCESS);
3296 }
3297
3298 /*
3299  * This should try to do aggressive accept() XXXMLG
3300  */
3301 isc_result_t
3302 isc__socket_accept(isc_socket_t *sock,
3303                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3304 {
3305         isc_socket_newconnev_t *adev;
3306         isc_socketmgr_t *manager;
3307         isc_task_t *ntask = NULL;
3308         isc_socket_t *nsock;
3309         isc_result_t result;
3310         IoCompletionInfo *lpo;
3311
3312         REQUIRE(VALID_SOCKET(sock));
3313
3314         manager = sock->manager;
3315         REQUIRE(VALID_MANAGER(manager));
3316
3317         LOCK(&sock->lock);
3318         CONSISTENT(sock);
3319
3320         /*
3321          * make sure that the socket's not closed
3322          */
3323         if (sock->fd == INVALID_SOCKET) {
3324                 UNLOCK(&sock->lock);
3325                 return (ISC_R_CONNREFUSED);
3326         }
3327
3328         REQUIRE(sock->listener);
3329
3330         /*
3331          * Sender field is overloaded here with the task we will be sending
3332          * this event to.  Just before the actual event is delivered the
3333          * actual ev_sender will be touched up to be the socket.
3334          */
3335         adev = (isc_socket_newconnev_t *)
3336                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3337                                    action, arg, sizeof(*adev));
3338         if (adev == NULL) {
3339                 UNLOCK(&sock->lock);
3340                 return (ISC_R_NOMEMORY);
3341         }
3342         ISC_LINK_INIT(adev, ev_link);
3343
3344         result = allocate_socket(manager, sock->type, &nsock);
3345         if (result != ISC_R_SUCCESS) {
3346                 isc_event_free((isc_event_t **)&adev);
3347                 UNLOCK(&sock->lock);
3348                 return (result);
3349         }
3350
3351         /*
3352          * AcceptEx() requires we pass in a socket.
3353          */
3354         nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3355         if (nsock->fd == INVALID_SOCKET) {
3356                 free_socket(&nsock, __LINE__);
3357                 isc_event_free((isc_event_t **)&adev);
3358                 UNLOCK(&sock->lock);
3359                 return (ISC_R_FAILURE); // XXXMLG need real error message
3360         }
3361
3362         /*
3363          * Attach to socket and to task.
3364          */
3365         isc_task_attach(task, &ntask);
3366         if (isc_task_exiting(ntask)) {
3367                 free_socket(&nsock, __LINE__);
3368                 isc_task_detach(&ntask);
3369                 isc_event_free(ISC_EVENT_PTR(&adev));
3370                 UNLOCK(&sock->lock);
3371                 return (ISC_R_SHUTTINGDOWN);
3372         }
3373         nsock->references++;
3374
3375         adev->ev_sender = ntask;
3376         adev->newsocket = nsock;
3377         _set_state(nsock, SOCK_ACCEPT);
3378
3379         /*
3380          * Queue io completion for an accept().
3381          */
3382         lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3383                                             HEAP_ZERO_MEMORY,
3384                                             sizeof(IoCompletionInfo));
3385         RUNTIME_CHECK(lpo != NULL);
3386         lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3387                 (sizeof(SOCKADDR_STORAGE) + 16) * 2);
3388         RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3389
3390         lpo->adev = adev;
3391         lpo->request_type = SOCKET_ACCEPT;
3392
3393         ISCAcceptEx(sock->fd,
3394                     nsock->fd,                          /* Accepted Socket */
3395                     lpo->acceptbuffer,                  /* Buffer for initial Recv */
3396                     0,                                  /* Length of Buffer */
3397                     sizeof(SOCKADDR_STORAGE) + 16,              /* Local address length + 16 */
3398                     sizeof(SOCKADDR_STORAGE) + 16,              /* Remote address lengh + 16 */
3399                     (LPDWORD)&lpo->received_bytes,      /* Bytes Recved */
3400                     (LPOVERLAPPED)lpo                   /* Overlapped structure */
3401                     );
3402         iocompletionport_update(nsock);
3403
3404         socket_log(__LINE__, sock, NULL, TRACE,
3405                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3406                    "accepting for nsock %p fd %d", nsock, nsock->fd);
3407
3408         /*
3409          * Enqueue the event
3410          */
3411         ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3412         sock->pending_accept++;
3413         sock->pending_iocp++;
3414
3415         UNLOCK(&sock->lock);
3416         return (ISC_R_SUCCESS);
3417 }
3418
3419 isc_result_t
3420 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3421                     isc_task_t *task, isc_taskaction_t action, const void *arg)
3422 {
3423         char strbuf[ISC_STRERRORSIZE];
3424         isc_socket_connev_t *cdev;
3425         isc_task_t *ntask = NULL;
3426         isc_socketmgr_t *manager;
3427         IoCompletionInfo *lpo;
3428         int bind_errno;
3429
3430         REQUIRE(VALID_SOCKET(sock));
3431         REQUIRE(addr != NULL);
3432         REQUIRE(task != NULL);
3433         REQUIRE(action != NULL);
3434
3435         manager = sock->manager;
3436         REQUIRE(VALID_MANAGER(manager));
3437         REQUIRE(addr != NULL);
3438
3439         if (isc_sockaddr_ismulticast(addr))
3440                 return (ISC_R_MULTICAST);
3441
3442         LOCK(&sock->lock);
3443         CONSISTENT(sock);
3444
3445         /*
3446          * make sure that the socket's not closed
3447          */
3448         if (sock->fd == INVALID_SOCKET) {
3449                 UNLOCK(&sock->lock);
3450                 return (ISC_R_CONNREFUSED);
3451         }
3452
3453         /*
3454          * Windows sockets won't connect unless the socket is bound.
3455          */
3456         if (!sock->bound) {
3457                 isc_sockaddr_t any;
3458
3459                 isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3460                 if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3461                         bind_errno = WSAGetLastError();
3462                         UNLOCK(&sock->lock);
3463                         switch (bind_errno) {
3464                         case WSAEACCES:
3465                                 return (ISC_R_NOPERM);
3466                         case WSAEADDRNOTAVAIL:
3467                                 return (ISC_R_ADDRNOTAVAIL);
3468                         case WSAEADDRINUSE:
3469                                 return (ISC_R_ADDRINUSE);
3470                         case WSAEINVAL:
3471                                 return (ISC_R_BOUND);
3472                         default:
3473                                 isc__strerror(bind_errno, strbuf,
3474                                               sizeof(strbuf));
3475                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3476                                                  "bind: %s", strbuf);
3477                                 return (ISC_R_UNEXPECTED);
3478                         }
3479                 }
3480                 sock->bound = 1;
3481         }
3482
3483         REQUIRE(!sock->pending_connect);
3484
3485         cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3486                                                         ISC_SOCKEVENT_CONNECT,
3487                                                         action, arg,
3488                                                         sizeof(*cdev));
3489         if (cdev == NULL) {
3490                 UNLOCK(&sock->lock);
3491                 return (ISC_R_NOMEMORY);
3492         }
3493         ISC_LINK_INIT(cdev, ev_link);
3494
3495         if (sock->type == isc_sockettype_tcp) {
3496                 /*
3497                  * Queue io completion for an accept().
3498                  */
3499                 lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3500                                                     HEAP_ZERO_MEMORY,
3501                                                     sizeof(IoCompletionInfo));
3502                 lpo->cdev = cdev;
3503                 lpo->request_type = SOCKET_CONNECT;
3504
3505                 sock->address = *addr;
3506                 ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3507                         NULL, 0, NULL, (LPOVERLAPPED)lpo);
3508
3509                 /*
3510                  * Attach to task.
3511                  */
3512                 isc_task_attach(task, &ntask);
3513                 cdev->ev_sender = ntask;
3514
3515                 sock->pending_connect = 1;
3516                 _set_state(sock, SOCK_CONNECT);
3517
3518                 /*
3519                  * Enqueue the request.
3520                  */
3521                 sock->connect_ev = cdev;
3522                 sock->pending_iocp++;
3523         } else {
3524                 WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3525                 cdev->result = ISC_R_SUCCESS;
3526                 isc_task_send(task, (isc_event_t **)&cdev);
3527         }
3528         CONSISTENT(sock);
3529         UNLOCK(&sock->lock);
3530
3531         return (ISC_R_SUCCESS);
3532 }
3533
3534 isc_result_t
3535 isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3536         isc_result_t result;
3537
3538         REQUIRE(VALID_SOCKET(sock));
3539         REQUIRE(addressp != NULL);
3540
3541         LOCK(&sock->lock);
3542         CONSISTENT(sock);
3543
3544         /*
3545          * make sure that the socket's not closed
3546          */
3547         if (sock->fd == INVALID_SOCKET) {
3548                 UNLOCK(&sock->lock);
3549                 return (ISC_R_CONNREFUSED);
3550         }
3551
3552         if (sock->connected) {
3553                 *addressp = sock->address;
3554                 result = ISC_R_SUCCESS;
3555         } else {
3556                 result = ISC_R_NOTCONNECTED;
3557         }
3558
3559         UNLOCK(&sock->lock);
3560
3561         return (result);
3562 }
3563
3564 isc_result_t
3565 isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3566         ISC_SOCKADDR_LEN_T len;
3567         isc_result_t result;
3568         char strbuf[ISC_STRERRORSIZE];
3569
3570         REQUIRE(VALID_SOCKET(sock));
3571         REQUIRE(addressp != NULL);
3572
3573         LOCK(&sock->lock);
3574         CONSISTENT(sock);
3575
3576         /*
3577          * make sure that the socket's not closed
3578          */
3579         if (sock->fd == INVALID_SOCKET) {
3580                 UNLOCK(&sock->lock);
3581                 return (ISC_R_CONNREFUSED);
3582         }
3583
3584         if (!sock->bound) {
3585                 result = ISC_R_NOTBOUND;
3586                 goto out;
3587         }
3588
3589         result = ISC_R_SUCCESS;
3590
3591         len = sizeof(addressp->type);
3592         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3593                 isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3594                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3595                                  strbuf);
3596                 result = ISC_R_UNEXPECTED;
3597                 goto out;
3598         }
3599         addressp->length = (unsigned int)len;
3600
3601  out:
3602         UNLOCK(&sock->lock);
3603
3604         return (result);
3605 }
3606
3607 /*
3608  * Run through the list of events on this socket, and cancel the ones
3609  * queued for task "task" of type "how".  "how" is a bitmask.
3610  */
3611 void
3612 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3613
3614         REQUIRE(VALID_SOCKET(sock));
3615
3616         /*
3617          * Quick exit if there is nothing to do.  Don't even bother locking
3618          * in this case.
3619          */
3620         if (how == 0)
3621                 return;
3622
3623         LOCK(&sock->lock);
3624         CONSISTENT(sock);
3625
3626         /*
3627          * make sure that the socket's not closed
3628          */
3629         if (sock->fd == INVALID_SOCKET) {
3630                 UNLOCK(&sock->lock);
3631                 return;
3632         }
3633
3634         /*
3635          * All of these do the same thing, more or less.
3636          * Each will:
3637          *      o If the internal event is marked as "posted" try to
3638          *        remove it from the task's queue.  If this fails, mark it
3639          *        as canceled instead, and let the task clean it up later.
3640          *      o For each I/O request for that task of that type, post
3641          *        its done event with status of "ISC_R_CANCELED".
3642          *      o Reset any state needed.
3643          */
3644
3645         if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3646                 isc_socketevent_t      *dev;
3647                 isc_socketevent_t      *next;
3648                 isc_task_t             *current_task;
3649
3650                 dev = ISC_LIST_HEAD(sock->recv_list);
3651                 while (dev != NULL) {
3652                         current_task = dev->ev_sender;
3653                         next = ISC_LIST_NEXT(dev, ev_link);
3654                         if ((task == NULL) || (task == current_task)) {
3655                                 dev->result = ISC_R_CANCELED;
3656                                 send_recvdone_event(sock, &dev);
3657                         }
3658                         dev = next;
3659                 }
3660         }
3661         how &= ~ISC_SOCKCANCEL_RECV;
3662
3663         if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3664                 isc_socketevent_t      *dev;
3665                 isc_socketevent_t      *next;
3666                 isc_task_t             *current_task;
3667
3668                 dev = ISC_LIST_HEAD(sock->send_list);
3669
3670                 while (dev != NULL) {
3671                         current_task = dev->ev_sender;
3672                         next = ISC_LIST_NEXT(dev, ev_link);
3673                         if ((task == NULL) || (task == current_task)) {
3674                                 dev->result = ISC_R_CANCELED;
3675                                 send_senddone_event(sock, &dev);
3676                         }
3677                         dev = next;
3678                 }
3679         }
3680         how &= ~ISC_SOCKCANCEL_SEND;
3681
3682         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3683             && !ISC_LIST_EMPTY(sock->accept_list)) {
3684                 isc_socket_newconnev_t *dev;
3685                 isc_socket_newconnev_t *next;
3686                 isc_task_t             *current_task;
3687
3688                 dev = ISC_LIST_HEAD(sock->accept_list);
3689                 while (dev != NULL) {
3690                         current_task = dev->ev_sender;
3691                         next = ISC_LIST_NEXT(dev, ev_link);
3692
3693                         if ((task == NULL) || (task == current_task)) {
3694
3695                                 dev->newsocket->references--;
3696                                 closesocket(dev->newsocket->fd);
3697                                 dev->newsocket->fd = INVALID_SOCKET;
3698                                 free_socket(&dev->newsocket, __LINE__);
3699
3700                                 dev->result = ISC_R_CANCELED;
3701                                 send_acceptdone_event(sock, &dev);
3702                         }
3703
3704                         dev = next;
3705                 }
3706         }
3707         how &= ~ISC_SOCKCANCEL_ACCEPT;
3708
3709         /*
3710          * Connecting is not a list.
3711          */
3712         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3713             && sock->connect_ev != NULL) {
3714                 isc_socket_connev_t    *dev;
3715                 isc_task_t             *current_task;
3716
3717                 INSIST(sock->pending_connect);
3718
3719                 dev = sock->connect_ev;
3720                 current_task = dev->ev_sender;
3721
3722                 if ((task == NULL) || (task == current_task)) {
3723                         closesocket(sock->fd);
3724                         sock->fd = INVALID_SOCKET;
3725                         _set_state(sock, SOCK_CLOSED);
3726
3727                         sock->connect_ev = NULL;
3728                         dev->result = ISC_R_CANCELED;
3729                         send_connectdone_event(sock, &dev);
3730                 }
3731         }
3732         how &= ~ISC_SOCKCANCEL_CONNECT;
3733
3734         maybe_free_socket(&sock, __LINE__);
3735 }
3736
3737 isc_sockettype_t
3738 isc__socket_gettype(isc_socket_t *sock) {
3739         isc_sockettype_t type;
3740
3741         REQUIRE(VALID_SOCKET(sock));
3742
3743         LOCK(&sock->lock);
3744
3745         /*
3746          * make sure that the socket's not closed
3747          */
3748         if (sock->fd == INVALID_SOCKET) {
3749                 UNLOCK(&sock->lock);
3750                 return (ISC_R_CONNREFUSED);
3751         }
3752
3753         type = sock->type;
3754         UNLOCK(&sock->lock);
3755         return (type);
3756 }
3757
3758 isc_boolean_t
3759 isc__socket_isbound(isc_socket_t *sock) {
3760         isc_boolean_t val;
3761
3762         REQUIRE(VALID_SOCKET(sock));
3763
3764         LOCK(&sock->lock);
3765         CONSISTENT(sock);
3766
3767         /*
3768          * make sure that the socket's not closed
3769          */
3770         if (sock->fd == INVALID_SOCKET) {
3771                 UNLOCK(&sock->lock);
3772                 return (ISC_FALSE);
3773         }
3774
3775         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3776         UNLOCK(&sock->lock);
3777
3778         return (val);
3779 }
3780
3781 void
3782 isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3783 #if defined(IPV6_V6ONLY)
3784         int onoff = yes ? 1 : 0;
3785 #else
3786         UNUSED(yes);
3787 #endif
3788
3789         REQUIRE(VALID_SOCKET(sock));
3790
3791 #ifdef IPV6_V6ONLY
3792         if (sock->pf == AF_INET6) {
3793                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3794                                  (char *)&onoff, sizeof(onoff));
3795         }
3796 #endif
3797 }
3798
3799 void
3800 isc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3801         UNUSED(addr);
3802         UNUSED(active);
3803 }
3804
3805 isc_result_t
3806 isc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3807                      isc_uint32_t owner,        isc_uint32_t group)
3808 {
3809         UNUSED(addr);
3810         UNUSED(perm);
3811         UNUSED(owner);
3812         UNUSED(group);
3813         return (ISC_R_NOTIMPLEMENTED);
3814 }
3815
3816 void
3817 isc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3818
3819         /*
3820          * Name 'socket'.
3821          */
3822
3823         REQUIRE(VALID_SOCKET(socket));
3824
3825         LOCK(&socket->lock);
3826         memset(socket->name, 0, sizeof(socket->name));
3827         strncpy(socket->name, name, sizeof(socket->name) - 1);
3828         socket->tag = tag;
3829         UNLOCK(&socket->lock);
3830 }
3831
3832 const char *
3833 isc__socket_getname(isc_socket_t *socket) {
3834         return (socket->name);
3835 }
3836
3837 void *
3838 isc__socket_gettag(isc_socket_t *socket) {
3839         return (socket->tag);
3840 }
3841
3842 int
3843 isc__socket_getfd(isc_socket_t *socket) {
3844         return ((short) socket->fd);
3845 }
3846
3847 void
3848 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3849         UNUSED(manager);
3850         UNUSED(reserved);
3851 }
3852
3853 void
3854 isc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3855
3856         UNUSED(manager);
3857         UNUSED(maxudp);
3858 }
3859
3860 #ifdef HAVE_LIBXML2
3861
3862 static const char *
3863 _socktype(isc_sockettype_t type)
3864 {
3865         if (type == isc_sockettype_udp)
3866                 return ("udp");
3867         else if (type == isc_sockettype_tcp)
3868                 return ("tcp");
3869         else if (type == isc_sockettype_unix)
3870                 return ("unix");
3871         else if (type == isc_sockettype_fdwatch)
3872                 return ("fdwatch");
3873         else
3874                 return ("not-initialized");
3875 }
3876
3877 void
3878 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3879 {
3880         isc_socket_t *sock;
3881         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3882         isc_sockaddr_t addr;
3883         ISC_SOCKADDR_LEN_T len;
3884
3885         LOCK(&mgr->lock);
3886
3887 #ifndef ISC_PLATFORM_USETHREADS
3888         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3889         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
3890         xmlTextWriterEndElement(writer);
3891 #endif
3892
3893         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
3894         sock = ISC_LIST_HEAD(mgr->socklist);
3895         while (sock != NULL) {
3896                 LOCK(&sock->lock);
3897                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
3898
3899                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
3900                 xmlTextWriterWriteFormatString(writer, "%p", sock);
3901                 xmlTextWriterEndElement(writer);
3902
3903                 if (sock->name[0] != 0) {
3904                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
3905                         xmlTextWriterWriteFormatString(writer, "%s",
3906                                                        sock->name);
3907                         xmlTextWriterEndElement(writer); /* name */
3908                 }
3909
3910                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3911                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
3912                 xmlTextWriterEndElement(writer);
3913
3914                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
3915                                           ISC_XMLCHAR _socktype(sock->type));
3916
3917                 if (sock->connected) {
3918                         isc_sockaddr_format(&sock->address, peerbuf,
3919                                             sizeof(peerbuf));
3920                         xmlTextWriterWriteElement(writer,
3921                                                   ISC_XMLCHAR "peer-address",
3922                                                   ISC_XMLCHAR peerbuf);
3923                 }
3924
3925                 len = sizeof(addr);
3926                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
3927                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
3928                         xmlTextWriterWriteElement(writer,
3929                                                   ISC_XMLCHAR "local-address",
3930                                                   ISC_XMLCHAR peerbuf);
3931                 }
3932
3933                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
3934                 if (sock->pending_recv)
3935                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3936                                                 ISC_XMLCHAR "pending-receive");
3937                 if (sock->pending_send)
3938                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3939                                                   ISC_XMLCHAR "pending-send");
3940                 if (sock->pending_accept)
3941                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3942                                                  ISC_XMLCHAR "pending_accept");
3943                 if (sock->listener)
3944                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3945                                                   ISC_XMLCHAR "listener");
3946                 if (sock->connected)
3947                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3948                                                   ISC_XMLCHAR "connected");
3949                 if (sock->pending_connect)
3950                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3951                                                   ISC_XMLCHAR "connecting");
3952                 if (sock->bound)
3953                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3954                                                   ISC_XMLCHAR "bound");
3955
3956                 xmlTextWriterEndElement(writer); /* states */
3957
3958                 xmlTextWriterEndElement(writer); /* socket */
3959
3960                 UNLOCK(&sock->lock);
3961                 sock = ISC_LIST_NEXT(sock, link);
3962         }
3963         xmlTextWriterEndElement(writer); /* sockets */
3964
3965         UNLOCK(&mgr->lock);
3966 }
3967 #endif /* HAVE_LIBXML2 */