share/doc/psd/21.ipc/5.t

   1 .\" Copyright (c) 1986, 1993
   2 .\"     The Regents of the University of California.  All rights reserved.
   3 .\"
   4 .\" Redistribution and use in source and binary forms, with or without
   5 .\" modification, are permitted provided that the following conditions
   6 .\" are met:
   7 .\" 1. Redistributions of source code must retain the above copyright
   8 .\"    notice, this list of conditions and the following disclaimer.
   9 .\" 2. Redistributions in binary form must reproduce the above copyright
  10 .\"    notice, this list of conditions and the following disclaimer in the
  11 .\"    documentation and/or other materials provided with the distribution.
  12 .\" 3. Neither the name of the University nor the names of its contributors
  13 .\"    may be used to endorse or promote products derived from this software
  14 .\"    without specific prior written permission.
  15 .\"
  16 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  17 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  20 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26 .\" SUCH DAMAGE.
  27 .\"
  28 .\"     @(#)5.t 8.1 (Berkeley) 8/14/93
  29 .\" $FreeBSD$
  30 .\"
  31 .\".ds RH "Advanced Topics
  32 .bp
  33 .nr H1 5
  34 .nr H2 0
  35 .LG
  36 .B
  37 .ce
  38 5. ADVANCED TOPICS
  39 .sp 2
  40 .R
  41 .NL
  42 .PP
  43 A number of facilities have yet to be discussed.  For most users
  44 of the IPC the mechanisms already
  45 described will suffice in constructing distributed
  46 applications.  However, others will find the need to utilize some
  47 of the features which we consider in this section.
  48 .NH 2
  49 Out of band data
  50 .PP
  51 The stream socket abstraction includes the notion of \*(lqout
  52 of band\*(rq data.  Out of band data is a logically independent
  53 transmission channel associated with each pair of connected
  54 stream sockets.  Out of band data is delivered to the user
  55 independently of normal data.
  56 The abstraction defines that the out of band data facilities
  57 must support the reliable delivery of at least one
  58 out of band message at a time.  This message may contain at least one
  59 byte of data, and at least one message may be pending delivery
  60 to the user at any one time.  For communications protocols which
  61 support only in-band signaling (i.e. the urgent data is
  62 delivered in sequence with the normal data), the system normally extracts
  63 the data from the normal data stream and stores it separately.
  64 This allows users to choose between receiving the urgent data
  65 in order and receiving it out of sequence without having to
  66 buffer all the intervening data.  It is possible
  67 to ``peek'' (via MSG_PEEK) at out of band data.
  68 If the socket has a process group, a SIGURG signal is generated
   69 when the protocol is notified of the existence of out of band data.
  70 A process can set the process group
  71 or process id to be informed by the SIGURG signal via the
  72 appropriate \fIfcntl\fP call, as described below for
  73 SIGIO.
  74 If multiple sockets may have out of band data awaiting
  75 delivery, a \fIselect\fP call for exceptional conditions
  76 may be used to determine those sockets with such data pending.
   77 Neither the signal nor the select indicates the actual arrival
  78 of the out-of-band data, but only notification that it is pending.
  79 .PP
  80 In addition to the information passed, a logical mark is placed in
  81 the data stream to indicate the point at which the out
  82 of band data was sent.  The remote login and remote shell
  83 applications use this facility to propagate signals between
  84 client and server processes.  When a signal
   85 flushes any pending output from the remote process(es), all
  86 data up to the mark in the data stream is discarded.
  87 .PP
  88 To send an out of band message the MSG_OOB flag is supplied to
   89 a \fIsend\fP or \fIsendto\fP call,
  90 while to receive out of band data MSG_OOB should be indicated
  91 when performing a \fIrecvfrom\fP or \fIrecv\fP call.
  92 To find out if the read pointer is currently pointing at
  93 the mark in the data stream, the SIOCATMARK ioctl is provided:
  94 .DS
  95 ioctl(s, SIOCATMARK, &yes);
  96 .DE
  97 If \fIyes\fP is a 1 on return, the next read will return data
  98 after the mark.  Otherwise (assuming out of band data has arrived),
  99 the next read will provide data sent by the client prior
 100 to transmission of the out of band signal.  The routine used
 101 in the remote login process to flush output on receipt of an
 102 interrupt or quit signal is shown in Figure 5.
 103 It reads the normal data up to the mark (to discard it),
 104 then reads the out-of-band byte.
 105 .KF
 106 .DS
 107 #include <sys/ioctl.h>
 108 #include <sys/file.h>
 109  ...
 110 oob()
 111 {
 112         int out = FWRITE, mark;
 113         char waste[BUFSIZ];
 114
 115         /* flush local terminal output */
 116         ioctl(1, TIOCFLUSH, (char *)&out);
 117         for (;;) {
 118                 if (ioctl(rem, SIOCATMARK, &mark) < 0) {
 119                         perror("ioctl");
 120                         break;
 121                 }
 122                 if (mark)
 123                         break;
 124                 (void) read(rem, waste, sizeof (waste));
 125         }
 126         if (recv(rem, &mark, 1, MSG_OOB) < 0) {
 127                 perror("recv");
 128                 ...
 129         }
 130         ...
 131 }
 132 .DE
 133 .ce
 134 Figure 5.  Flushing terminal I/O on receipt of out of band data.
 135 .sp
 136 .KE
 137 .PP
 138 A process may also read or peek at the out-of-band data
 139 without first reading up to the mark.
 140 This is more difficult when the underlying protocol delivers
 141 the urgent data in-band with the normal data, and only sends
 142 notification of its presence ahead of time (e.g., the TCP protocol
 143 used to implement streams in the Internet domain).
 144 With such protocols, the out-of-band byte may not yet have arrived
 145 when a \fIrecv\fP is done with the MSG_OOB flag.
 146 In that case, the call will return an error of EWOULDBLOCK.
 147 Worse, there may be enough in-band data in the input buffer
 148 that normal flow control prevents the peer from sending the urgent data
 149 until the buffer is cleared.
 150 The process must then read enough of the queued data
 151 that the urgent data may be delivered.
 152 .PP
 153 Certain programs that use multiple bytes of urgent data and must
 154 handle multiple urgent signals (e.g., \fItelnet\fP\|(1C))
 155 need to retain the position of urgent data within the stream.
 156 This treatment is available as a socket-level option, SO_OOBINLINE;
 157 see \fIsetsockopt\fP\|(2) for usage.
 158 With this option, the position of urgent data (the \*(lqmark\*(rq)
 159 is retained, but the urgent data immediately follows the mark
 160 within the normal data stream returned without the MSG_OOB flag.
 161 Reception of multiple urgent indications causes the mark to move,
 162 but no out-of-band data are lost.
 163 .NH 2
 164 Non-Blocking Sockets
 165 .PP
 166 It is occasionally convenient to make use of sockets
 167 which do not block; that is, I/O requests which
 168 cannot complete immediately and
 169 would therefore cause the process to be suspended awaiting completion are
 170 not executed, and an error code is returned.
 171 Once a socket has been created via
 172 the \fIsocket\fP call, it may be marked as non-blocking
 173 by \fIfcntl\fP as follows:
 174 .DS
 175 #include <fcntl.h>
 176  ...
 177 int     s;
 178  ...
 179 s = socket(AF_INET, SOCK_STREAM, 0);
 180  ...
  181 if (fcntl(s, F_SETFL, FNDELAY) < 0) {
 182         perror("fcntl F_SETFL, FNDELAY");
 183         exit(1);
 184 }
 185  ...
 186 .DE
 187 .PP
 188 When performing non-blocking I/O on sockets, one must be
 189 careful to check for the error EWOULDBLOCK (stored in the
 190 global variable \fIerrno\fP), which occurs when
 191 an operation would normally block, but the socket it
 192 was performed on is marked as non-blocking.
 193 In particular, \fIaccept\fP, \fIconnect\fP, \fIsend\fP, \fIrecv\fP,
 194 \fIread\fP, and \fIwrite\fP can
 195 all return EWOULDBLOCK, and processes should be prepared
 196 to deal with such return codes.
 197 If an operation such as a \fIsend\fP cannot be done in its entirety,
 198 but partial writes are sensible (for example, when using a stream socket),
 199 the data that can be sent immediately will be processed,
 200 and the return value will indicate the amount actually sent.
 201 .NH 2
 202 Interrupt driven socket I/O
 203 .PP
 204 The SIGIO signal allows a process to be notified
 205 via a signal when a socket (or more generally, a file
 206 descriptor) has data waiting to be read.  Use of
 207 the SIGIO facility requires three steps:  First,
 208 the process must set up a SIGIO signal handler
 209 by use of the \fIsignal\fP or \fIsigvec\fP calls.  Second,
 210 it must set the process id or process group id which is to receive
 211 notification of pending input to its own process id,
 212 or the process group id of its process group (note that
 213 the default process group of a socket is group zero).
 214 This is accomplished by use of an \fIfcntl\fP call.
 215 Third, it must enable asynchronous notification of pending I/O requests
 216 with another \fIfcntl\fP call.  Sample code to
 217 allow a given process to receive information on
 218 pending I/O requests as they occur for a socket \fIs\fP
 219 is given in Figure 6.  With the addition of a handler for SIGURG,
 220 this code can also be used to prepare for receipt of SIGURG signals.
 221 .KF
 222 .DS
 223 #include <fcntl.h>
 224  ...
 225 int     io_handler();
 226  ...
 227 signal(SIGIO, io_handler);
 228
 229 /* Set the process receiving SIGIO/SIGURG signals to us */
 230
 231 if (fcntl(s, F_SETOWN, getpid()) < 0) {
 232         perror("fcntl F_SETOWN");
 233         exit(1);
 234 }
 235
 236 /* Allow receipt of asynchronous I/O signals */
 237
 238 if (fcntl(s, F_SETFL, FASYNC) < 0) {
 239         perror("fcntl F_SETFL, FASYNC");
 240         exit(1);
 241 }
 242 .DE
 243 .ce
 244 Figure 6.  Use of asynchronous notification of I/O requests.
 245 .sp
 246 .KE
 247 .NH 2
 248 Signals and process groups
 249 .PP
 250 Due to the existence of the SIGURG and SIGIO signals each socket has an
 251 associated process number, just as is done for terminals.
 252 This value is initialized to zero,
 253 but may be redefined at a later time with the F_SETOWN
 254 \fIfcntl\fP, such as was done in the code above for SIGIO.
 255 To set the socket's process id for signals, positive arguments
 256 should be given to the \fIfcntl\fP call.  To set the socket's
 257 process group for signals, negative arguments should be
 258 passed to \fIfcntl\fP.  Note that the process number indicates
 259 either the associated process id or the associated process
 260 group; it is impossible to specify both at the same time.
 261 A similar \fIfcntl\fP, F_GETOWN, is available for determining the
 262 current process number of a socket.
 263 .PP
 264 Another signal which is useful when constructing server processes
 265 is SIGCHLD.  This signal is delivered to a process when any
 266 child processes have changed state.  Normally servers use
 267 the signal to \*(lqreap\*(rq child processes that have exited
 268 without explicitly awaiting their termination
  269 or periodically polling for exit status.
 270 For example, the remote login server loop shown in Figure 2
 271 may be augmented as shown in Figure 7.
 272 .KF
 273 .DS
 274 int reaper();
 275  ...
 276 signal(SIGCHLD, reaper);
 277 listen(f, 5);
 278 for (;;) {
 279         int g, len = sizeof (from);
 280
  281         g = accept(f, (struct sockaddr *)&from, &len);
 282         if (g < 0) {
 283                 if (errno != EINTR)
 284                         syslog(LOG_ERR, "rlogind: accept: %m");
 285                 continue;
 286         }
 287         ...
 288 }
 289  ...
 290 #include <wait.h>
 291 reaper()
 292 {
 293         union wait status;
 294
 295         while (wait3(&status, WNOHANG, 0) > 0)
 296                 ;
 297 }
 298 .DE
 299 .sp
 300 .ce
 301 Figure 7.  Use of the SIGCHLD signal.
 302 .sp
 303 .KE
 304 .PP
 305 If the parent server process fails to reap its children,
 306 a large number of \*(lqzombie\*(rq processes may be created.
 307 .NH 2
 308 Pseudo terminals
 309 .PP
 310 Many programs will not function properly without a terminal
 311 for standard input and output.  Since sockets do not provide
 312 the semantics of terminals,
 313 it is often necessary to have a process communicating over
 314 the network do so through a \fIpseudo-terminal\fP.  A pseudo-
 315 terminal is actually a pair of devices, master and slave,
 316 which allow a process to serve as an active agent in communication
 317 between processes and users.  Data written on the slave side
 318 of a pseudo-terminal is supplied as input to a process reading
 319 from the master side, while data written on the master side are
 320 processed as terminal input for the slave.
 321 In this way, the process manipulating
 322 the master side of the pseudo-terminal has control over the
 323 information read and written on the slave side
 324 as if it were manipulating the keyboard and reading the screen
 325 on a real terminal.
 326 The purpose of this abstraction is to
 327 preserve terminal semantics over a network connection\(em
 328 that is, the slave side appears as a normal terminal to
 329 any process reading from or writing to it.
 330 .PP
 331 For example, the remote
 332 login server uses pseudo-terminals for remote login sessions.
 333 A user logging in to a machine across the network is provided
 334 a shell with a slave pseudo-terminal as standard input, output,
 335 and error.  The server process then handles the communication
 336 between the programs invoked by the remote shell and the user's
 337 local client process.
 338 When a user sends a character that generates an interrupt
 339 on the remote machine that flushes terminal output,
 340 the pseudo-terminal generates a control message for the server process.
 341 The server then sends an out of band message
 342 to the client process to signal a flush of data at the real terminal
 343 and on the intervening data buffered in the network.
 344 .PP
 345 Under 4.4BSD, the name of the slave side of a pseudo-terminal is of the form
 346 \fI/dev/ttyxy\fP, where \fIx\fP is a single letter
 347 starting at `p' and continuing to `t'.
 348 \fIy\fP is a hexadecimal digit (i.e., a single
 349 character in the range 0 through 9 or `a' through `f').
 350 The master side of a pseudo-terminal is \fI/dev/ptyxy\fP,
 351 where \fIx\fP and \fIy\fP correspond to the
 352 slave side of the pseudo-terminal.
 353 .PP
 354 In general, the method of obtaining a pair of master and
 355 slave pseudo-terminals is to
 356 find a pseudo-terminal which
 357 is not currently in use.
 358 The master half of a pseudo-terminal is a single-open device;
 359 thus, each master may be opened in turn until an open succeeds.
 360 The slave side of the pseudo-terminal is then opened,
 361 and is set to the proper terminal modes if necessary.
 362 The process then \fIfork\fPs; the child closes
 363 the master side of the pseudo-terminal, and \fIexec\fPs the
 364 appropriate program.  Meanwhile, the parent closes the
 365 slave side of the pseudo-terminal and begins reading and
 366 writing from the master side.  Sample code making use of
 367 pseudo-terminals is given in Figure 8; this code assumes
 368 that a connection on a socket \fIs\fP exists, connected
 369 to a peer who wants a service of some kind, and that the
 370 process has disassociated itself from any previous controlling terminal.
 371 .KF
 372 .DS
 373 gotpty = 0;
 374 for (c = 'p'; !gotpty && c <= 's'; c++) {
 375         line = "/dev/ptyXX";
 376         line[sizeof("/dev/pty")-1] = c;
 377         line[sizeof("/dev/ptyp")-1] = '0';
 378         if (stat(line, &statbuf) < 0)
 379                 break;
 380         for (i = 0; i < 16; i++) {
 381                 line[sizeof("/dev/ptyp")-1] = "0123456789abcdef"[i];
 382                 master = open(line, O_RDWR);
  383                 if (master >= 0) {
 384                         gotpty = 1;
 385                         break;
 386                 }
 387         }
 388 }
 389 if (!gotpty) {
 390         syslog(LOG_ERR, "All network ports in use");
 391         exit(1);
 392 }
 393
 394 line[sizeof("/dev/")-1] = 't';
 395 slave = open(line, O_RDWR);     /* \fIslave\fP is now slave side */
 396 if (slave < 0) {
 397         syslog(LOG_ERR, "Cannot open slave pty %s", line);
 398         exit(1);
 399 }
 400
 401 ioctl(slave, TIOCGETP, &b);     /* Set slave tty modes */
 402 b.sg_flags = CRMOD|XTABS|ANYP;
 403 ioctl(slave, TIOCSETP, &b);
 404
 405 i = fork();
 406 if (i < 0) {
 407         syslog(LOG_ERR, "fork: %m");
 408         exit(1);
 409 } else if (i) {         /* Parent */
 410         close(slave);
 411         ...
 412 } else {                 /* Child */
 413         (void) close(s);
 414         (void) close(master);
 415         dup2(slave, 0);
 416         dup2(slave, 1);
 417         dup2(slave, 2);
 418         if (slave > 2)
 419                 (void) close(slave);
 420         ...
 421 }
 422 .DE
 423 .ce
 424 Figure 8.  Creation and use of a pseudo terminal
 425 .sp
 426 .KE
 427 .NH 2
 428 Selecting specific protocols
 429 .PP
 430 If the third argument to the \fIsocket\fP call is 0,
 431 \fIsocket\fP will select a default protocol to use with
 432 the returned socket of the type requested.
 433 The default protocol is usually correct, and alternate choices are not
 434 usually available.
 435 However, when using ``raw'' sockets to communicate directly with
 436 lower-level protocols or hardware interfaces,
 437 the protocol argument may be important for setting up demultiplexing.
 438 For example, raw sockets in the Internet family may be used to implement
 439 a new protocol above IP, and the socket will receive packets
 440 only for the protocol specified.
 441 To obtain a particular protocol one determines the protocol number
 442 as defined within the communication domain.  For the Internet
 443 domain one may use one of the library routines
 444 discussed in section 3, such as \fIgetprotobyname\fP:
 445 .DS
 446 #include <sys/types.h>
 447 #include <sys/socket.h>
 448 #include <netinet/in.h>
 449 #include <netdb.h>
 450  ...
 451 pp = getprotobyname("newtcp");
 452 s = socket(AF_INET, SOCK_STREAM, pp->p_proto);
 453 .DE
 454 This would result in a socket \fIs\fP using a stream
 455 based connection, but with protocol type of ``newtcp''
 456 instead of the default ``tcp.''
 457 .PP
 458 In the NS domain, the available socket protocols are defined in
 459 <\fInetns/ns.h\fP>.  To create a raw socket for Xerox Error Protocol
 460 messages, one might use:
 461 .DS
 462 #include <sys/types.h>
 463 #include <sys/socket.h>
 464 #include <netns/ns.h>
 465  ...
 466 s = socket(AF_NS, SOCK_RAW, NSPROTO_ERROR);
 467 .DE
 468 .NH 2
 469 Address binding
 470 .PP
 471 As was mentioned in section 2,
 472 binding addresses to sockets in the Internet and NS domains can be
 473 fairly complex.  As a brief reminder, these associations
 474 are composed of local and foreign
 475 addresses, and local and foreign ports.  Port numbers are
 476 allocated out of separate spaces, one for each system and one
 477 for each domain on that system.
 478 Through the \fIbind\fP system call, a
 479 process may specify half of an association, the
 480 <local address, local port> part, while the
 481 \fIconnect\fP
 482 and \fIaccept\fP
 483 primitives are used to complete a socket's association by
 484 specifying the <foreign address, foreign port> part.
 485 Since the association is created in two steps the association
 486 uniqueness requirement indicated previously could be violated unless
 487 care is taken.  Further, it is unrealistic to expect user
 488 programs to always know proper values to use for the local address
 489 and local port since a host may reside on multiple networks and
 490 the set of allocated port numbers is not directly accessible
 491 to a user.
 492 .PP
 493 To simplify local address binding in the Internet domain the notion of a
 494 \*(lqwildcard\*(rq address has been provided.  When an address
 495 is specified as INADDR_ANY (a manifest constant defined in
 496 <netinet/in.h>), the system interprets the address as
 497 \*(lqany valid address\*(rq.  For example, to bind a specific
 498 port number to a socket, but leave the local address unspecified,
 499 the following code might be used:
 500 .DS
 501 #include <sys/types.h>
 502 #include <netinet/in.h>
 503  ...
 504 struct sockaddr_in sin;
 505  ...
 506 s = socket(AF_INET, SOCK_STREAM, 0);
 507 sin.sin_family = AF_INET;
 508 sin.sin_addr.s_addr = htonl(INADDR_ANY);
 509 sin.sin_port = htons(MYPORT);
 510 bind(s, (struct sockaddr *) &sin, sizeof (sin));
 511 .DE
 512 Sockets with wildcarded local addresses may receive messages
 513 directed to the specified port number, and sent to any
 514 of the possible addresses assigned to a host.  For example,
 515 if a host has addresses 128.32.0.4 and 10.0.0.78, and a socket is bound as
 516 above, the process will be
 517 able to accept connection requests which are addressed to
 518 128.32.0.4 or 10.0.0.78.
 519 If a server process wished to only allow hosts on a
  520 given network to connect to it, it would bind
 521 the address of the host on the appropriate network.
 522 .PP
 523 In a similar fashion, a local port may be left unspecified
 524 (specified as zero), in which case the system will select an
 525 appropriate port number for it.  This shortcut will work
 526 both in the Internet and NS domains.  For example, to
 527 bind a specific local address to a socket, but to leave the
 528 local port number unspecified:
 529 .DS
 530 hp = gethostbyname(hostname);
 531 if (hp == NULL) {
 532         ...
 533 }
  534 bcopy(hp->h_addr, (char *) &sin.sin_addr, hp->h_length);
 535 sin.sin_port = htons(0);
 536 bind(s, (struct sockaddr *) &sin, sizeof (sin));
 537 .DE
 538 The system selects the local port number based on two criteria.
 539 The first is that on 4BSD systems,
 540 Internet ports below IPPORT_RESERVED (1024) (for the Xerox domain,
 541 0 through 3000) are reserved
 542 for privileged users (i.e., the super user);
 543 Internet ports above IPPORT_USERRESERVED (50000) are reserved
 544 for non-privileged servers.  The second is
 545 that the port number is not currently bound to some other
 546 socket.  In order to find a free Internet port number in the privileged
 547 range the \fIrresvport\fP library routine may be used as follows
  548 to return a stream socket with a privileged port number:
 549 .DS
 550 int lport = IPPORT_RESERVED \- 1;
 551 int s;
 552 \&...
 553 s = rresvport(&lport);
 554 if (s < 0) {
 555         if (errno == EAGAIN)
 556                 fprintf(stderr, "socket: all ports in use\en");
 557         else
 558                 perror("rresvport: socket");
 559         ...
 560 }
 561 .DE
 562 The restriction on allocating ports was done to allow processes
 563 executing in a \*(lqsecure\*(rq environment to perform authentication
 564 based on the originating address and port number.  For example,
 565 the \fIrlogin\fP(1) command allows users to log in across a network
 566 without being asked for a password, if two conditions hold:
 567 First, the name of the system the user
 568 is logging in from is in the file
 569 \fI/etc/hosts.equiv\fP on the system he is logging
 570 in to (or the system name and the user name are in
 571 the user's \fI.rhosts\fP file in the user's home
 572 directory), and second, that the user's rlogin
 573 process is coming from a privileged port on the machine from which he is
  574 logging in.  The port number and network address of the
 575 machine from which the user is logging in can be determined either
 576 by the \fIfrom\fP result of the \fIaccept\fP call, or
 577 from the \fIgetpeername\fP call.
 578 .PP
 579 In certain cases the algorithm used by the system in selecting
 580 port numbers is unsuitable for an application.  This is because
 581 associations are created in a two step process.  For example,
 582 the Internet file transfer protocol, FTP, specifies that data
 583 connections must always originate from the same local port.  However,
 584 duplicate associations are avoided by connecting to different foreign
 585 ports.  In this situation the system would disallow binding the
 586 same local address and port number to a socket if a previous data
 587 connection's socket still existed.  To override the default port
 588 selection algorithm, an option call must be performed prior
 589 to address binding:
 590 .DS
 591  ...
 592 int     on = 1;
 593  ...
 594 setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 595 bind(s, (struct sockaddr *) &sin, sizeof (sin));
 596 .DE
 597 With the above call, local addresses may be bound which
 598 are already in use.  This does not violate the uniqueness
 599 requirement as the system still checks at connect time to
 600 be sure any other sockets with the same local address and
 601 port do not have the same foreign address and port.
 602 If the association already exists, the error EADDRINUSE is returned.
 603 A related socket option, SO_REUSEPORT, which allows completely
 604 duplicate bindings, is described in the IP multicasting section.
 605 .NH 2
 606 Socket Options
 607 .PP
 608 It is possible to set and get a number of options on sockets
 609 via the \fIsetsockopt\fP and \fIgetsockopt\fP system calls.
 610 These options include such things as marking a socket for
 611 broadcasting, not to route, to linger on close, etc.
 612 In addition, there are protocol-specific options for IP and TCP,
 613 as described in
 614 .IR ip (4),
 615 .IR tcp (4),
 616 and in the section on multicasting below.
 617 .PP
 618 The general forms of the calls are:
 619 .DS
 620 setsockopt(s, level, optname, optval, optlen);
 621 .DE
 622 and
 623 .DS
 624 getsockopt(s, level, optname, optval, optlen);
 625 .DE
 626 .PP
 627 The parameters to the calls are as follows: \fIs\fP
 628 is the socket on which the option is to be applied.
 629 \fILevel\fP specifies the protocol layer on which the
 630 option is to be applied; in most cases this is
 631 the ``socket level'', indicated by the symbolic constant
 632 SOL_SOCKET, defined in \fI<sys/socket.h>.\fP
 633 The actual option is specified in \fIoptname\fP, and is
 634 a symbolic constant also defined in \fI<sys/socket.h>\fP.
 635 \fIOptval\fP and \fIOptlen\fP point to the value of the
 636 option (in most cases, whether the option is to be turned
 637 on or off), and the length of the value of the option,
 638 respectively.
 639 For \fIgetsockopt\fP, \fIoptlen\fP is
 640 a value-result parameter, initially set to the size of
 641 the storage area pointed to by \fIoptval\fP, and modified
 642 upon return to indicate the actual amount of storage used.
 643 .PP
 644 An example should help clarify things.  It is sometimes
 645 useful to determine the type (e.g., stream, datagram, etc.)
 646 of an existing socket; programs
 647 under \fIinetd\fP (described below) may need to perform this
 648 task.  This can be accomplished as follows via the
 649 SO_TYPE socket option and the \fIgetsockopt\fP call:
 650 .DS
 651 #include <sys/types.h>
 652 #include <sys/socket.h>
 653
 654 int type, size;
 655
 656 size = sizeof (int);
 657
 658 if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &size) < 0) {
 659         ...
 660 }
 661 .DE
 662 After the \fIgetsockopt\fP call, \fItype\fP will be set
 663 to the value of the socket type, as defined in
 664 \fI<sys/socket.h>\fP.  If, for example, the socket were
 665 a datagram socket, \fItype\fP would have the value
 666 corresponding to SOCK_DGRAM.
 667 .NH 2
 668 Broadcasting and determining network configuration
 669 .PP
 670 By using a datagram socket, it is possible to send broadcast
 671 packets on many networks supported by the system.
 672 The network itself must support broadcast; the system
 673 provides no simulation of broadcast in software.
 674 Broadcast messages can place a high load on a network since they force
 675 every host on the network to service them.  Consequently,
 676 the ability to send broadcast packets has been limited
 677 to sockets which are explicitly marked as allowing broadcasting.
 678 Broadcast is typically used for one of two reasons:
 679 it is desired to find a resource on a local network without prior
 680 knowledge of its address,
 681 or important functions such as routing require that information
 682 be sent to all accessible neighbors.
 683 .PP
 684 Multicasting is an alternative to broadcasting.
 685 Setting up IP multicast sockets is described in the next section.
 686 .PP
 687 To send a broadcast message, a datagram socket
 688 should be created:
 689 .DS
 690 s = socket(AF_INET, SOCK_DGRAM, 0);
 691 .DE
 692 or
 693 .DS
 694 s = socket(AF_NS, SOCK_DGRAM, 0);
 695 .DE
 696 The socket is marked as allowing broadcasting,
 697 .DS
 698 int     on = 1;
 699
 700 setsockopt(s, SOL_SOCKET, SO_BROADCAST, &on, sizeof (on));
 701 .DE
 702 and at least a port number should be bound to the socket:
 703 .DS
 704 sin.sin_family = AF_INET;
 705 sin.sin_addr.s_addr = htonl(INADDR_ANY);
 706 sin.sin_port = htons(MYPORT);
 707 bind(s, (struct sockaddr *) &sin, sizeof (sin));
 708 .DE
 709 or, for the NS domain,
 710 .DS
 711 sns.sns_family = AF_NS;
 712 netnum = htonl(net);
 713 sns.sns_addr.x_net = *(union ns_net *) &netnum; /* insert net number */
 714 sns.sns_addr.x_port = htons(MYPORT);
 715 bind(s, (struct sockaddr *) &sns, sizeof (sns));
 716 .DE
 717 The destination address of the message to be broadcast
 718 depends on the network(s) on which the message is to be broadcast.
 719 The Internet domain supports a shorthand notation for broadcast
 720 on the local network, the address INADDR_BROADCAST (defined in
  721 <\fInetinet/in.h\fP>).
 722 To determine the list of addresses for all reachable neighbors
 723 requires knowledge of the networks to which the host is connected.
 724 Since this information should
 725 be obtained in a host-independent fashion and may be impossible
 726 to derive, 4.4BSD provides a method of
 727 retrieving this information from the system data structures.
 728 The SIOCGIFCONF \fIioctl\fP call returns the interface
 729 configuration of a host in the form of a
 730 single \fIifconf\fP structure; this structure contains
 731 a ``data area'' which is made up of an array of
  732 \fIifreq\fP structures, one for each network interface
 733 to which the host is connected.
 734 These structures are defined in
 735 \fI<net/if.h>\fP as follows:
 736 .DS
 737 .if t .ta .5i 1.0i 1.5i 3.5i
 738 .if n .ta .7i 1.4i 2.1i 3.4i
 739 struct ifconf {
 740         int     ifc_len;                /* size of associated buffer */
 741         union {
 742                 caddr_t ifcu_buf;
 743                 struct  ifreq *ifcu_req;
 744         } ifc_ifcu;
 745 };
 746
 747 #define ifc_buf ifc_ifcu.ifcu_buf               /* buffer address */
 748 #define ifc_req ifc_ifcu.ifcu_req               /* array of structures returned */
 749
 750 #define IFNAMSIZ        16
 751
 752 struct ifreq {
 753         char    ifr_name[IFNAMSIZ];             /* if name, e.g. "en0" */
 754         union {
 755                 struct  sockaddr ifru_addr;
 756                 struct  sockaddr ifru_dstaddr;
 757                 struct  sockaddr ifru_broadaddr;
 758                 short   ifru_flags;
 759                 caddr_t ifru_data;
 760         } ifr_ifru;
 761 };
 762
 763 .if t .ta \w'  #define'u +\w'  ifr_broadaddr'u +\w'  ifr_ifru.ifru_broadaddr'u
 764 #define ifr_addr        ifr_ifru.ifru_addr      /* address */
 765 #define ifr_dstaddr     ifr_ifru.ifru_dstaddr   /* other end of p-to-p link */
 766 #define ifr_broadaddr   ifr_ifru.ifru_broadaddr /* broadcast address */
 767 #define ifr_flags       ifr_ifru.ifru_flags     /* flags */
 768 #define ifr_data        ifr_ifru.ifru_data      /* for use by interface */
 769 .DE
 770 The actual call which obtains the
 771 interface configuration is
 772 .DS
 773 struct ifconf ifc;
 774 char buf[BUFSIZ];
 775
 776 ifc.ifc_len = sizeof (buf);
 777 ifc.ifc_buf = buf;
 778 if (ioctl(s, SIOCGIFCONF, (char *) &ifc) < 0) {
 779         ...
 780 }
 781 .DE
 782 After this call \fIbuf\fP will contain one \fIifreq\fP structure for
 783 each network to which the host is connected, and
 784 \fIifc.ifc_len\fP will have been modified to reflect the number
 785 of bytes used by the \fIifreq\fP structures.
 786 .PP
 787 For each structure
 788 there exists a set of ``interface flags'' which tell
 789 whether the network corresponding to that interface is
 790 up or down, point to point or broadcast, etc.  The
 791 SIOCGIFFLAGS \fIioctl\fP retrieves these
 792 flags for an interface specified by an \fIifreq\fP
 793 structure as follows:
 794 .DS
 795 struct ifreq *ifr;
 796
 797 ifr = ifc.ifc_req;
 798
 799 for (n = ifc.ifc_len / sizeof (struct ifreq); --n >= 0; ifr++) {
 800         /*
 801          * We must be careful that we don't use an interface
 802          * devoted to an address family other than those intended;
 803          * if we were interested in NS interfaces, the
 804          * AF_INET would be AF_NS.
 805          */
 806         if (ifr->ifr_addr.sa_family != AF_INET)
 807                 continue;
 808         if (ioctl(s, SIOCGIFFLAGS, (char *) ifr) < 0) {
 809                 ...
 810         }
 811         /*
 812          * Skip boring cases.
 813          */
 814         if ((ifr->ifr_flags & IFF_UP) == 0 ||
 815             (ifr->ifr_flags & IFF_LOOPBACK) ||
 816             (ifr->ifr_flags & (IFF_BROADCAST | IFF_POINTTOPOINT)) == 0)
 817                 continue;
 818 .DE
 819 .PP
 820 Once the flags have been obtained, the broadcast address
 821 must be obtained.  In the case of broadcast networks this is
 822 done via the SIOCGIFBRDADDR \fIioctl\fP, while for point-to-point networks
 823 the address of the destination host is obtained with SIOCGIFDSTADDR.
 824 .DS
 825 struct sockaddr dst;
 826
 827 if (ifr->ifr_flags & IFF_POINTTOPOINT) {
 828         if (ioctl(s, SIOCGIFDSTADDR, (char *) ifr) < 0) {
 829                 ...
 830         }
 831         bcopy((char *) &ifr->ifr_dstaddr, (char *) &dst, sizeof (ifr->ifr_dstaddr));
 832 } else if (ifr->ifr_flags & IFF_BROADCAST) {
 833         if (ioctl(s, SIOCGIFBRDADDR, (char *) ifr) < 0) {
 834                 ...
 835         }
 836         bcopy((char *) &ifr->ifr_broadaddr, (char *) &dst, sizeof (ifr->ifr_broadaddr));
 837 }
 838 .DE
 839 .PP
 840 After the appropriate \fIioctl\fP's have obtained the broadcast
 841 or destination address (now in \fIdst\fP), the \fIsendto\fP call may be
 842 used:
 843 .DS
 844         sendto(s, buf, buflen, 0, (struct sockaddr *)&dst, sizeof (dst));
 845 }
 846 .DE
 847 In the above loop one \fIsendto\fP occurs for every
 848 interface to which the host is connected that supports the notion of
 849 broadcast or point-to-point addressing.
 850 If a process only wished to send broadcast
 851 messages on a given network, code similar to that outlined above
 852 would be used, but the loop would need to find the
 853 correct destination address.
 854 .PP
 855 Received broadcast messages contain the sender's address
 856 and port, as datagram sockets are bound before
 857 a message is allowed to go out.
 858 .NH 2
 859 IP Multicasting
 860 .PP
 861 IP multicasting is the transmission of an IP datagram to a "host
 862 group", a set of zero or more hosts identified by a single IP
 863 destination address.  A multicast datagram is delivered to all
 864 members of its destination host group with the same "best-efforts"
 865 reliability as regular unicast IP datagrams, i.e., the datagram is
 866 not guaranteed to arrive intact at all members of the destination
 867 group or in the same order relative to other datagrams.
 868 .PP
 869 The membership of a host group is dynamic; that is, hosts may join
 870 and leave groups at any time.  There is no restriction on the
 871 location or number of members in a host group.  A host may be a
 872 member of more than one group at a time.  A host need not be a member
 873 of a group to send datagrams to it.
 874 .PP
 875 A host group may be permanent or transient.  A permanent group has a
 876 well-known, administratively assigned IP address.  It is the address,
 877 not the membership of the group, that is permanent; at any time a
 878 permanent group may have any number of members, even zero.  Those IP
 879 multicast addresses that are not reserved for permanent groups are
 880 available for dynamic assignment to transient groups which exist only
 881 as long as they have members.
 882 .PP
 883 In general, a host cannot assume that datagrams sent to any host
 884 group address will reach only the intended hosts, or that datagrams
 885 received as a member of a transient host group are intended for the
 886 recipient.  Misdelivery must be detected at a level above IP, using
 887 higher-level identifiers or authentication tokens.  Information
 888 transmitted to a host group address should be encrypted or governed
 889 by administrative routing controls if the sender is concerned about
 890 unwanted listeners.
 891 .PP
 892 IP multicasting is currently supported only on AF_INET sockets of type
 893 SOCK_DGRAM and SOCK_RAW, and only on subnetworks for which the interface
 894 driver has been modified to support multicasting.
 895 .PP
 896 The next subsections describe how to send and receive multicast datagrams.
 897 .NH 3
 898 Sending IP Multicast Datagrams
 899 .PP
 900 To send a multicast datagram, specify an IP multicast address in the range
 901 224.0.0.0 to 239.255.255.255 as the destination address
 902 in a
 903 .IR sendto (2)
 904 call.
 905 .PP
 906 The definitions required for the multicast-related socket options are
 907 found in \fI<netinet/in.h>\fP.
 908 All IP addresses are passed in network byte-order.
 909 .PP
 910 By default, IP multicast datagrams are sent with a time-to-live (TTL) of 1,
 911 which prevents them from being forwarded beyond a single subnetwork.  A new
 912 socket option allows the TTL for subsequent multicast datagrams to be set to
 913 any value from 0 to 255, in order to control the scope of the multicasts:
 914 .DS
 915 u_char ttl;
 916 setsockopt(sock, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
 917 .DE
 918 Multicast datagrams with a TTL of 0 will not be transmitted on any subnet,
 919 but may be delivered locally if the sending host belongs to the destination
 920 group and if multicast loopback has not been disabled on the sending socket
 921 (see below).  Multicast datagrams with TTL greater than one may be delivered
 922 to more than one subnet if there are one or more multicast routers attached
 923 to the first-hop subnet.  To provide meaningful scope control, the multicast
 924 routers support the notion of TTL "thresholds", which prevent datagrams with
 925 less than a certain TTL from traversing certain subnets.  The thresholds
 926 enforce the following convention:
 927 .TS
 928 center;
 929 l | l
 930 l | n.
 931 _
 932 Scope   Initial TTL
 933 =
 934 restricted to the same host     0
 935 restricted to the same subnet   1
 936 restricted to the same site     32
 937 restricted to the same region   64
 938 restricted to the same continent        128
 939 unrestricted    255
 940 _
 941 .TE
 942 "Sites" and "regions" are not strictly defined, and sites may be further
 943 subdivided into smaller administrative units, as a local matter.
 944 .PP
 945 An application may choose an initial TTL other than the ones listed above.
 946 For example, an application might perform an "expanding-ring search" for a
 947 network resource by sending a multicast query, first with a TTL of 0, and
 948 then with larger and larger TTLs, until a reply is received, perhaps using
 949 the TTL sequence 0, 1, 2, 4, 8, 16, 32.
 950 .PP
 951 The multicast router
 952 .IR mrouted (8)
 953 refuses to forward any
 954 multicast datagram with a destination address between 224.0.0.0 and
 955 224.0.0.255, inclusive, regardless of its TTL.  This range of addresses is
 956 reserved for the use of routing protocols and other low-level topology
 957 discovery or maintenance protocols, such as gateway discovery and group
 958 membership reporting.
 959 .PP
 960 The address 224.0.0.0 is
 961 guaranteed not to be assigned to any group, and 224.0.0.1 is assigned
 962 to the permanent group of all IP hosts (including gateways).  This is
 963 used to address all multicast hosts on the directly connected
 964 network.  There is no multicast address (or any other IP address) for
 965 all hosts on the total Internet.  The addresses of other well-known,
 966 permanent groups are published in the "Assigned Numbers" RFC,
 967 which is available from the InterNIC.
 968 .PP
 969 Each multicast transmission is sent from a single network interface, even if
 970 the host has more than one multicast-capable interface.  (If the host is
 971 also serving as a multicast router,
 972 a multicast may be \fIforwarded\fP to interfaces
 973 other than the originating interface, provided that the TTL is greater than 1.)
 974 The default interface to be used for multicasting is the primary network
 975 interface on the system.
 976 A socket option
 977 is available to override the default for subsequent transmissions from a
 978 given socket:
 979 .DS
 980 struct in_addr addr;
 981 setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr));
 982 .DE
 983 where "addr" is the local IP address of the desired outgoing interface.
 984 An address of INADDR_ANY may be used to revert to the default interface.
 985 The local IP address of an interface can be obtained via the SIOCGIFCONF
 986 ioctl.  To determine if an interface supports multicasting, fetch the
 987 interface flags via the SIOCGIFFLAGS ioctl and see if the IFF_MULTICAST
 988 flag is set.  (Normal applications should not need to use this option; it
 989 is intended primarily for multicast routers and other system services
 990 specifically concerned with internet topology.)
 991 The SIOCGIFCONF and SIOCGIFFLAGS ioctls are described in the previous section.
 992 .PP
 993 If a multicast datagram is sent to a group to which the sending host itself
 994 belongs (on the outgoing interface), a copy of the datagram is, by default,
 995 looped back by the IP layer for local delivery.  Another socket option gives
 996 the sender explicit control over whether or not subsequent datagrams are
 997 looped back:
 998 .DS
 999 u_char loop;
1000 setsockopt(sock, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop));
1001 .DE
1002 where \f2loop\f1 is set to 0 to disable loopback,
1003 and set to 1 to enable loopback.
1004 This option
1005 improves performance for applications that may have no more than one
1006 instance on a single host (such as a router daemon), by eliminating
1007 the overhead of receiving their own transmissions.  It should generally not
1008 be used by applications for which there may be more than one instance on a
1009 single host (such as a conferencing program) or for which the sender does
1010 not belong to the destination group (such as a time querying program).
1011 .PP
1012 A multicast datagram sent with an initial TTL greater than 1 may be delivered
1013 to the sending host on a different interface from that on which it was sent,
1014 if the host belongs to the destination group on that other interface.  The
1015 loopback control option has no effect on such delivery.
1016 .NH 3
1017 Receiving IP Multicast Datagrams
1018 .PP
1019 Before a host can receive IP multicast datagrams, it must become a member
1020 of one or more IP multicast groups.  A process can ask the host to join
1021 a multicast group by using the following socket option:
1022 .DS
1023 struct ip_mreq mreq;
1024 setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
1025 .DE
1026 where "mreq" is the following structure:
1027 .DS
1028 struct ip_mreq {
1029     struct in_addr imr_multiaddr; /* \fImulticast group to join\fP */
1030     struct in_addr imr_interface; /* \fIinterface to join on\fP */
1031 };
1032 .DE
1033 Every membership is associated with a single interface, and it is possible
1034 to join the same group on more than one interface.  "imr_interface" should
1035 be INADDR_ANY to choose the default multicast interface, or one of the
1036 host's local addresses to choose a particular (multicast-capable) interface.
1037 Up to IP_MAX_MEMBERSHIPS (currently 20) memberships may be added on a
1038 single socket.
1039 .PP
1040 To drop a membership, use:
1041 .DS
1042 struct ip_mreq mreq;
1043 setsockopt(sock, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
1044 .DE
1045 where "mreq" contains the same values as used to add the membership.  The
1046 memberships associated with a socket are also dropped when the socket is
1047 closed or the process holding the socket is killed.  However, more than
1048 one socket may claim a membership in a particular group, and the host
1049 will remain a member of that group until the last claim is dropped.
1050 .PP
1051 The memberships associated with a socket do not necessarily determine which
1052 datagrams are received on that socket.  Incoming multicast packets are
1053 accepted by the kernel IP layer if any socket has claimed a membership in the
1054 destination group of the datagram; however, delivery of a multicast datagram
1055 to a particular socket is based on the destination port (or protocol type, for
1056 raw sockets), just as with unicast datagrams.
1057 To receive multicast datagrams
1058 sent to a particular port, it is necessary to bind to that local port,
1059 leaving the local address unspecified (i.e., INADDR_ANY).
1060 To receive multicast datagrams
1061 sent to a particular group and port, bind to the local port, with
1062 the local address set to the multicast group address.
1063 Once bound to a multicast address, the socket cannot be used for sending data.
1064 .PP
1065 More than one process may bind to the same SOCK_DGRAM UDP port
1066 or the same multicast group and port if the
1067 .I bind
1068 call is preceded by:
1069 .DS
1070 int on = 1;
1071 setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
1072 .DE
1073 All processes sharing the port must enable this option.
1074 Every incoming multicast or broadcast UDP datagram destined to
1075 the shared port is delivered to all sockets bound to the port.
1076 For backwards compatibility reasons, this does not apply to incoming
1077 unicast datagrams.  Unicast
1078 datagrams are never delivered to more than one socket, regardless of
1079 how many sockets are bound to the datagram's destination port.
1080 .PP
1081 A final multicast-related extension is independent of IP:  two new ioctls,
1082 SIOCADDMULTI and SIOCDELMULTI, are available to add or delete link-level
1083 (e.g., Ethernet) multicast addresses accepted by a particular interface.
1084 The address to be added or deleted is passed as a sockaddr structure of
1085 family AF_UNSPEC, within the standard ifreq structure.
1086 .PP
1087 These ioctls are
1088 for the use of protocols other than IP, and require superuser privileges.
1089 A link-level multicast address added via SIOCADDMULTI is not automatically
1090 deleted when the socket used to add it goes away; it must be explicitly
1091 deleted.  It is inadvisable to delete a link-level address that may be
1092 in use by IP.
1093 .NH 3
1094 Sample Multicast Program
1095 .PP
1096 The following program sends or receives multicast packets.
1097 If invoked with one argument, it sends a packet containing the current
1098 time to an arbitrarily-chosen multicast group and UDP port.
1099 If invoked with no arguments, it receives and prints these packets.
1100 Start it as a sender on just one host and as a receiver on all the other hosts.
1101 .DS
1102 #include <sys/types.h>
1103 #include <sys/socket.h>
1104 #include <netinet/in.h>
1105 #include <arpa/inet.h>
1106 #include <time.h>
1107 #include <stdio.h>
1108
1109 #define EXAMPLE_PORT    60123
1110 #define EXAMPLE_GROUP   "224.0.0.250"
1111
1112 main(argc)
1113     int argc;
1114 {
1115     struct sockaddr_in addr;
1116     int addrlen, fd, cnt;
1117     struct ip_mreq mreq;
1118     char message[50];
1119
1120     fd = socket(AF_INET, SOCK_DGRAM, 0);
1121     if (fd < 0) {
1122         perror("socket");
1123         exit(1);
1124     }
1125
1126     bzero(&addr, sizeof(addr));
1127     addr.sin_family = AF_INET;
1128     addr.sin_addr.s_addr = htonl(INADDR_ANY);
1129     addr.sin_port = htons(EXAMPLE_PORT);
1130     addrlen = sizeof(addr);
1131
1132     if (argc > 1) {     /* Send */
1133         addr.sin_addr.s_addr = inet_addr(EXAMPLE_GROUP);
1134         while (1) {
1135             time_t t = time(0);
1136             sprintf(message, "time is %-24.24s", ctime(&t));
1137             cnt = sendto(fd, message, sizeof(message), 0,
1138                     (struct sockaddr *)&addr, addrlen);
1139             if (cnt < 0) {
1140                 perror("sendto");
1141                 exit(1);
1142             }
1143             sleep(5);
1144         }
1145     } else {            /* Receive */
1146         if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
1147             perror("bind");
1148             exit(1);
1149         }
1150
1151         mreq.imr_multiaddr.s_addr = inet_addr(EXAMPLE_GROUP);
1152         mreq.imr_interface.s_addr = htonl(INADDR_ANY);
1153         if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
1154                     &mreq, sizeof(mreq)) < 0) {
1155             perror("setsockopt mreq");
1156             exit(1);
1157         }
1158
1159         while (1) {
1160             cnt = recvfrom(fd, message, sizeof(message), 0,
1161                             (struct sockaddr *)&addr, &addrlen);
1162             if (cnt <= 0) {
1163                     if (cnt == 0) {
1164                         break;
1165                     }
1166                     perror("recvfrom");
1167                     exit(1);
1168             }
1169             printf("%s: message = \e"%s\e"\en",
1170                     inet_ntoa(addr.sin_addr), message);
1171         }
1172     }
1173 }
1174 .DE
1175 .\"----------------------------------------------------------------------
1176 .NH 2
1177 NS Packet Sequences
1178 .PP
1179 The semantics of NS connections demand that
1180 the user both be able to look inside the network header associated
1181 with any incoming packet and be able to specify what should go
1182 in certain fields of an outgoing packet.
1183 Using different calls to \fIsetsockopt\fP, it is possible
1184 to indicate whether prototype headers will be associated by
1185 the user with each outgoing packet (SO_HEADERS_ON_OUTPUT),
1186 to indicate whether the headers received by the system should be
1187 delivered to the user (SO_HEADERS_ON_INPUT), or to indicate
1188 default information that should be associated with all
1189 outgoing packets on a given socket (SO_DEFAULT_HEADERS).
1190 .PP
1191 The contents of a SPP header (minus the IDP header) are:
1192 .DS
1193 .if t .ta \w"  #define"u +\w"  u_short"u +2.0i
1194 struct sphdr {
1195         u_char  sp_cc;          /* connection control */
1196 #define SP_SP   0x80            /* system packet */
1197 #define SP_SA   0x40            /* send acknowledgement */
1198 #define SP_OB   0x20            /* attention (out of band data) */
1199 #define SP_EM   0x10            /* end of message */
1200         u_char  sp_dt;          /* datastream type */
1201         u_short sp_sid;         /* source connection identifier */
1202         u_short sp_did;         /* destination connection identifier */
1203         u_short sp_seq;         /* sequence number */
1204         u_short sp_ack;         /* acknowledge number */
1205         u_short sp_alo;         /* allocation number */
1206 };
1207 .DE
1208 Here, the items of interest are the \fIdatastream type\fP and
1209 the \fIconnection control\fP fields.  The semantics of the
1210 datastream type are defined by the application(s) in question;
1211 the value of this field is, by default, zero, but it can be
1212 used to indicate things such as Xerox's Bulk Data Transfer
1213 Protocol (in which case it is set to one).  The connection control
1214 field is a mask of the flags defined just below it.  The user may
1215 set or clear the end-of-message bit to indicate
1216 that a given message is the last of a given substream type,
1217 or may set/clear the attention bit as an alternate way to
1218 indicate that a packet should be sent out-of-band.
1219 As an example, to associate prototype headers with outgoing
1220 SPP packets, consider:
1221 .DS
1222 #include <sys/types.h>
1223 #include <sys/socket.h>
1224 #include <netns/ns.h>
1225 #include <netns/sp.h>
1226  ...
1227 struct sockaddr_ns sns, to;
1228 int s, on = 1;
1229 struct databuf {
1230         struct sphdr proto_spp; /* prototype header */
1231         char buf[534];          /* max. possible data by Xerox std. */
1232 } buf;
1233  ...
1234 s = socket(AF_NS, SOCK_SEQPACKET, 0);
1235  ...
1236 bind(s, (struct sockaddr *) &sns, sizeof (sns));
1237 setsockopt(s, NSPROTO_SPP, SO_HEADERS_ON_OUTPUT, &on, sizeof(on));
1238  ...
1239 buf.proto_spp.sp_dt = 1;        /* bulk data */
1240 buf.proto_spp.sp_cc = SP_EM;    /* end-of-message */
1241 strcpy(buf.buf, "hello world\en");
1242 sendto(s, (char *) &buf, sizeof(struct sphdr) + strlen("hello world\en"), 0,
1243     (struct sockaddr *) &to, sizeof(to));
1244  ...
1245 .DE
1246 Note that one must be careful when writing headers; if the prototype
1247 header is not written with the data with which it is to be associated,
1248 the kernel will treat the first few bytes of the data as the
1249 header, with unpredictable results.
1250 To turn off the above association, and to indicate that packet
1251 headers received by the system should be passed up to the user,
1252 one might use:
1253 .DS
1254 #include <sys/types.h>
1255 #include <sys/socket.h>
1256 #include <netns/ns.h>
1257 #include <netns/sp.h>
1258  ...
1259 struct sockaddr_ns sns;
1260 int s, on = 1, off = 0;
1261  ...
1262 s = socket(AF_NS, SOCK_SEQPACKET, 0);
1263  ...
1264 bind(s, (struct sockaddr *) &sns, sizeof (sns));
1265 setsockopt(s, NSPROTO_SPP, SO_HEADERS_ON_OUTPUT, &off, sizeof(off));
1266 setsockopt(s, NSPROTO_SPP, SO_HEADERS_ON_INPUT, &on, sizeof(on));
1267  ...
1268 .DE
1269 .PP
1270 Output is handled somewhat differently in the IDP world.
1271 The header of an IDP-level packet looks like:
1272 .DS
1273 .if t .ta \w'struct  'u +\w"  struct ns_addr"u +2.0i
1274 struct idp {
1275         u_short idp_sum;        /* Checksum */
1276         u_short idp_len;        /* Length, in bytes, including header */
1277         u_char  idp_tc;         /* Transport Control (i.e., hop count) */
1278         u_char  idp_pt;         /* Packet Type (i.e., level 2 protocol) */
1279         struct ns_addr  idp_dna;        /* Destination Network Address */
1280         struct ns_addr  idp_sna;        /* Source Network Address */
1281 };
1282 .DE
1283 The primary field of interest in an IDP header is the \fIpacket type\fP
1284 field.  The standard values for this field are (as defined
1285 in <\fInetns/ns.h\fP>):
1286 .DS
1287 .if t .ta \w"  #define"u +\w"  NSPROTO_ERROR"u +1.0i
1288 #define NSPROTO_RI      1               /* Routing Information */
1289 #define NSPROTO_ECHO    2               /* Echo Protocol */
1290 #define NSPROTO_ERROR   3               /* Error Protocol */
1291 #define NSPROTO_PE      4               /* Packet Exchange */
1292 #define NSPROTO_SPP     5               /* Sequenced Packet */
1293 .DE
1294 For SPP connections, the contents of this field are
1295 automatically set to NSPROTO_SPP; for IDP packets,
1296 this value defaults to zero, which means ``unknown''.
1297 .PP
1298 Setting the value of that field with SO_DEFAULT_HEADERS is
1299 easy:
1300 .DS
1301 #include <sys/types.h>
1302 #include <sys/socket.h>
1303 #include <netns/ns.h>
1304 #include <netns/idp.h>
1305  ...
1306 struct sockaddr_ns sns;
1307 struct idp proto_idp;           /* prototype header */
1308 int s, on = 1;
1309  ...
1310 s = socket(AF_NS, SOCK_DGRAM, 0);
1311  ...
1312 bind(s, (struct sockaddr *) &sns, sizeof (sns));
1313 proto_idp.idp_pt = NSPROTO_PE;  /* packet exchange */
1314 setsockopt(s, NSPROTO_IDP, SO_DEFAULT_HEADERS, (char *) &proto_idp,
1315     sizeof(proto_idp));
1316  ...
1317 .DE
1318 .PP
1319 Using SO_HEADERS_ON_OUTPUT is somewhat more difficult.  When
1320 SO_HEADERS_ON_OUTPUT is turned on for an IDP socket, the socket
1321 becomes (for all intents and purposes) a raw socket.  In this
1322 case, all the fields of the prototype header (except the
1323 length and checksum fields, which are computed by the kernel)
1324 must be filled in correctly in order for the socket to send and
1325 receive data in a sensible manner.  To be more specific, the
1326 source address must be set to that of the host sending the
1327 data; the destination address must be set to that of the
1328 host for whom the data is intended; the packet type must be
1329 set to whatever value is desired; and the hopcount must be
1330 set to some reasonable value (almost always zero).  It should
1331 also be noted that simply sending data using \fIwrite\fP
1332 will not work unless a \fIconnect\fP or \fIsendto\fP call
1333 is used, in spite of the fact that it is the destination
1334 address in the prototype header that is used, not the one
1335 given in either of those calls.  For almost
1336 all IDP applications, using SO_DEFAULT_HEADERS is easier and
1337 more desirable than writing headers.
1338 .NH 2
1339 Three-way Handshake
1340 .PP
1341 The semantics of SPP connections indicate that a three-way
1342 handshake, involving changes in the datastream type, should \(em
1343 but is not absolutely required to \(em take place before a SPP
1344 connection is closed.  Almost all SPP connections are
1345 ``well-behaved'' in this manner; when communicating with
1346 any process, it is best to assume that the three-way handshake
1347 is required unless it is known for certain that it is not
1348 required.  In a three-way close, the closing process
1349 indicates that it wishes to close the connection by sending
1350 a zero-length packet with end-of-message set and with
1351 datastream type 254.  The other side of the connection
1352 indicates that it is OK to close by sending a zero-length
1353 packet with end-of-message set and datastream type 255.  Finally,
1354 the closing process replies with a zero-length packet with
1355 substream type 255; at this point, the connection is considered
1356 closed.  The following code fragments are simplified examples
1357 of how one might handle this three-way handshake at the user
1358 level; in the future, support for this type of close will
1359 probably be provided as part of the C library or as part of
1360 the kernel.  The first code fragment below illustrates how a process
1361 might handle the three-way handshake if it sees that the process it
1362 is communicating with wants to close the connection:
1363 .DS
1364 #include <sys/types.h>
1365 #include <sys/socket.h>
1366 #include <netns/ns.h>
1367 #include <netns/sp.h>
1368  ...
1369 #ifndef SPPSST_END
1370 #define SPPSST_END 254
1371 #define SPPSST_ENDREPLY 255
1372 #endif
1373 struct sphdr proto_sp;
1374 int s;
1375  ...
1376 read(s, buf, BUFSIZE);
1377 if (((struct sphdr *)buf)->sp_dt == SPPSST_END) {
1378         /*
1379          * SPPSST_END indicates that the other side wants to
1380          * close.
1381          */
1382         proto_sp.sp_dt = SPPSST_ENDREPLY;
1383         proto_sp.sp_cc = SP_EM;
1384         setsockopt(s, NSPROTO_SPP, SO_DEFAULT_HEADERS, (char *)&proto_sp,
1385             sizeof(proto_sp));
1386         write(s, buf, 0);
1387         /*
1388          * Write a zero-length packet with datastream type = SPPSST_ENDREPLY
1389          * to indicate that the close is OK with us.  The packet that we
1390          * don't see (because we don't look for it) is another packet
1391          * from the other side of the connection, with SPPSST_ENDREPLY
1392          * on it, too.  Once that packet is sent, the connection is
1393          * considered closed; note that we really ought to retransmit
1394          * the close for some time if we do not get a reply.
1395          */
1396         close(s);
1397 }
1398  ...
1399 .DE
1400 To indicate to another process that we would like to close the
1401 connection, the following code would suffice:
1402 .DS
1403 #include <sys/types.h>
1404 #include <sys/socket.h>
1405 #include <netns/ns.h>
1406 #include <netns/sp.h>
1407  ...
1408 #ifndef SPPSST_END
1409 #define SPPSST_END 254
1410 #define SPPSST_ENDREPLY 255
1411 #endif
1412 struct sphdr proto_sp;
1413 int s;
1414  ...
1415 proto_sp.sp_dt = SPPSST_END;
1416 proto_sp.sp_cc = SP_EM;
1417 setsockopt(s, NSPROTO_SPP, SO_DEFAULT_HEADERS, (char *)&proto_sp,
1418     sizeof(proto_sp));
1419 write(s, buf, 0);       /* send the end request */
1420 proto_sp.sp_dt = SPPSST_ENDREPLY;
1421 setsockopt(s, NSPROTO_SPP, SO_DEFAULT_HEADERS, (char *)&proto_sp,
1422     sizeof(proto_sp));
1423 /*
1424  * We assume (perhaps unwisely)
1425  * that the other side will send the
1426  * ENDREPLY, so we'll just send our final ENDREPLY
1427  * as if we'd seen theirs already.
1428  */
1429 write(s, buf, 0);
1430 close(s);
1431  ...
1432 .DE
1433 .NH 2
1434 Packet Exchange
1435 .PP
1436 The Xerox standard protocols include a protocol that is both
1437 reliable and datagram-oriented.  This protocol is known as
1438 Packet Exchange (PEX or PE) and, like SPP, is layered on top
1439 of IDP.  PEX is important for a number of things: Courier
1440 remote procedure calls may be expedited through the use
1441 of PEX, and many Xerox servers are located by doing a PEX
1442 ``BroadcastForServers'' operation.  Although there is no
1443 implementation of PEX in the kernel,
1444 it may be simulated at the user level with some clever coding
1445 and the use of one peculiar \fIgetsockopt\fP.  A PEX packet
1446 looks like:
1447 .DS
1448 .if t .ta \w'struct  'u +\w"  struct idp"u +2.0i
1449 /*
1450  * The packet-exchange header shown here is not defined
1451  * as part of any of the system include files.
1452  */
1453 struct pex {
1454         struct idp      p_idp;  /* idp header */
1455         u_short ph_id[2];       /* unique transaction ID for pex */
1456         u_short ph_client;      /* client type field for pex */
1457 };
1458 .DE
1459 The \fIph_id\fP field is used to hold a ``unique id'' that
1460 is used in duplicate suppression; the \fIph_client\fP
1461 field indicates the PEX client type (similar to the packet
1462 type field in the IDP header).  PEX reliability stems from the
1463 fact that it is an idempotent (``I send a packet to you, you
1464 send a packet to me'') protocol.  Processes on each side of
1465 the connection may use the unique id to determine if they have
1466 seen a given packet before (the unique id field differs on each
1467 packet sent) so that duplicates may be detected, and to indicate
1468 which message a given packet is in response to.  If a packet with
1469 a given unique id is sent and no response is received in a given
1470 amount of time, the packet is retransmitted until it is decided
1471 that no response will ever be received.  To simulate PEX, one
1472 must be able to generate unique ids -- something that is hard to
1473 do at the user level with any real guarantee that the id is really
1474 unique.  Therefore, a means (via \fIgetsockopt\fP) has been provided
1475 for getting unique ids from the kernel.  The following code fragment
1476 indicates how to get a unique id:
1477 .DS
1478 long uniqueid;
1479 int s, idsize = sizeof(uniqueid);
1480  ...
1481 s = socket(AF_NS, SOCK_DGRAM, 0);
1482  ...
1483 /* get id from the kernel -- only on IDP sockets */
1484 getsockopt(s, NSPROTO_PE, SO_SEQNO, (char *)&uniqueid, &idsize);
1485  ...
1486 .DE
1487 The retransmission and duplicate suppression code required to
1488 simulate PEX fully is left as an exercise for the reader.
1489 .NH 2
1490 Inetd
1491 .PP
1492 One of the daemons provided with 4.4BSD is \fIinetd\fP, the
1493 so-called ``internet super-server.''
1494 Having one daemon listen for requests for many daemons
1495 instead of having each daemon listen for its own requests
1496 reduces the number of idle daemons and simplifies their implementation.
1497 .I Inetd
1498 handles
1499 two types of services: standard and TCPMUX.
1500 A standard service has a well-known port assigned to it and
1501 is listed in
1502 .I /etc/services
1503 (see \f2services\f1(5));
1504 it may be a service that implements an official Internet standard or is a
1505 BSD-specific service.
1506 TCPMUX services are nonstandard and do not have a
1507 well-known port assigned to them.
1508 They are invoked from
1509 .I inetd
1510 when a program connects to the ``tcpmux'' well-known port and specifies
1511 the service name.
1512 This is useful for adding locally-developed servers.
1513 .PP
1514 \fIInetd\fP is invoked at boot
1515 time, and determines from the file \fI/etc/inetd.conf\fP the
1516 servers for which it is to listen.  Once this information has been
1517 read and a pristine environment created, \fIinetd\fP proceeds
1518 to create one socket for each service it is to listen for,
1519 binding the appropriate port number to each socket.
1520 .PP
1521 \fIInetd\fP then performs a \fIselect\fP on all these
1522 sockets for read availability, waiting for somebody wishing
1523 a connection to the service corresponding to
1524 that socket.  \fIInetd\fP then performs an \fIaccept\fP on
1525 the socket in question, \fIfork\fPs, \fIdup\fPs the new
1526 socket to file descriptors 0 and 1 (stdin and
1527 stdout), closes other open file
1528 descriptors, and \fIexec\fPs the appropriate server.
1529 .PP
1530 Servers making use of \fIinetd\fP are considerably simplified,
1531 as \fIinetd\fP takes care of the majority of the IPC work
1532 required in establishing a connection.  The server invoked
1533 by \fIinetd\fP expects the socket connected to its client
1534 on file descriptors 0 and 1, and may immediately perform
1535 any operations such as \fIread\fP, \fIwrite\fP, \fIsend\fP,
1536 or \fIrecv\fP.  Indeed, servers may use
1537 buffered I/O as provided by the ``stdio'' conventions, as
1538 long as they remember to use \fIfflush\fP when appropriate.
1539 .PP
1540 One call which may be of interest to individuals writing
1541 servers under \fIinetd\fP is the \fIgetpeername\fP call,
1542 which returns the address of the peer (process) connected
1543 on the other end of the socket.  For example, to log the
1544 Internet address in ``dot notation'' (e.g., ``128.32.0.4'')
1545 of a client connected to a server under
1546 \fIinetd\fP, the following code might be used:
1547 .DS
1548 struct sockaddr_in name;
1549 int namelen = sizeof (name);
1550  ...
1551 if (getpeername(0, (struct sockaddr *)&name, &namelen) < 0) {
1552         syslog(LOG_ERR, "getpeername: %m");
1553         exit(1);
1554 } else
1555         syslog(LOG_INFO, "Connection from %s", inet_ntoa(name.sin_addr));
1556  ...
1557 .DE
1558 While the \fIgetpeername\fP call is especially useful when
1559 writing programs to run with \fIinetd\fP, it can be used
1560 under other circumstances.  Be warned, however, that \fIgetpeername\fP will
1561 fail on UNIX domain sockets.
1562 .PP
1563 Standard TCP
1564 services are assigned unique well-known port numbers in the range of
1565 0 to 1023 by the
1566 Internet Assigned Numbers Authority (IANA@ISI.EDU).
1567 The limited number of ports in this range are
1568 assigned to official Internet protocols.
1569 The TCPMUX service allows you to add
1570 locally-developed protocols without needing an official TCP port assignment.
1571 The TCPMUX protocol described in RFC-1078 is simple:
1572 .QP
1573 ``A TCP client connects to a foreign host on TCP port 1.  It sends the
1574 service name followed by a carriage-return line-feed <CRLF>.
1575 The service name is never case sensitive.
1576 The server replies with a
1577 single character indicating positive ("+") or negative ("\-")
1578 acknowledgment, immediately followed by an optional message of
1579 explanation, terminated with a <CRLF>.  If the reply was positive,
1580 the selected protocol begins; otherwise the connection is closed.''
1581 .LP
1582 In 4.4BSD, the TCPMUX service is built into
1583 .IR inetd ;
1584 that is,
1585 .IR inetd
1586 listens on TCP port 1 for requests for TCPMUX services listed
1587 in \f2inetd.conf\f1.
1588 .IR inetd (8)
1589 describes the format of TCPMUX entries for \f2inetd.conf\f1.
1590 .PP
1591 The following is an example TCPMUX server and its \f2inetd.conf\f1 entry.
1592 More sophisticated servers may want to do additional processing
1593 before returning the positive or negative acknowledgement.
1594 .DS
1595 #include <sys/types.h>
1596 #include <stdio.h>
1597
1598 main()
1599 {
1600         time_t t;
1601
1602         printf("+Go\er\en");
1603         fflush(stdout);
1604         time(&t);
1605         printf("%d = %s", t, ctime(&t));
1606         fflush(stdout);
1607 }
1608 .DE
1609 The \f2inetd.conf\f1 entry is:
1610 .DS
1611 tcpmux/current_time stream tcp nowait nobody /d/curtime curtime
1612 .DE
1613 Here's the portion of the client code that handles the TCPMUX handshake:
1614 .DS
1615 char line[BUFSIZ];
1616 FILE *fp;
1617  ...
1618
1619 /* Use stdio for reading data from the server */
1620 fp = fdopen(sock, "r");
1621 if (fp == NULL) {
1622     fprintf(stderr, "Can't create file pointer\en");
1623     exit(1);
1624 }
1625
1626 /* Send service request */
1627 sprintf(line, "%s\er\en", "current_time");
1628 if (write(sock, line, strlen(line)) < 0) {
1629     perror("write");
1630     exit(1);
1631 }
1632
1633 /* Get ACK/NAK response from the server */
1634 if (fgets(line, sizeof(line), fp) == NULL) {
1635     if (feof(fp)) {
1636         die();
1637     } else {
1638         fprintf(stderr, "Error reading response\en");
1639         exit(1);
1640     }
1641 }
1642
1643 /* Delete <CR> */
1644 if ((lp = index(line, '\er')) != NULL) {
1645     *lp = '\e0';
1646 }
1647
1648 switch (line[0]) {
1649     case '+':
1650             printf("Got ACK: %s\en", &line[1]);
1651             break;
1652     case '-':
1653             printf("Got NAK: %s\en", &line[1]);
1654             exit(0);
1655     default:
1656             printf("Got unknown response: %s\en", line);
1657             exit(1);
1658 }
1659
1660 /* Get rest of data from the server */
1661 while ((fgets(line, sizeof(line), fp)) != NULL) {
1662     fputs(line, stdout);
1663 }
1664 .DE