1 .\" Copyright (c) 2000 FreeBSD Inc.
2 .\" All rights reserved.
4 .\" Redistribution and use in source and binary forms, with or without
5 .\" modification, are permitted provided that the following conditions
7 .\" 1. Redistributions of source code must retain the above copyright
8 .\" notice, this list of conditions and the following disclaimer.
9 .\" 2. Redistributions in binary form must reproduce the above copyright
10 .\" notice, this list of conditions and the following disclaimer in the
11 .\" documentation and/or other materials provided with the distribution.
13 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 .\" ARE DISCLAIMED. IN NO EVENT SHALL [your name] OR CONTRIBUTORS BE LIABLE
17 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 .Nd "memory management in the kernel IPC subsystem"
40 .Ss Mbuf allocation macros
41 .Fn MGET "struct mbuf *mbuf" "int how" "short type"
42 .Fn MGETHDR "struct mbuf *mbuf" "int how" "short type"
44 .Fn MCLGET "struct mbuf *mbuf" "int how"
46 .Fa "struct mbuf *mbuf"
49 .Fa "void (*free)(void *opt_arg1, void *opt_arg2)"
56 .Ss Mbuf utility macros
57 .Fn mtod "struct mbuf *mbuf" "type"
58 .Fn M_ALIGN "struct mbuf *mbuf" "u_int len"
59 .Fn MH_ALIGN "struct mbuf *mbuf" "u_int len"
61 .Fn M_LEADINGSPACE "struct mbuf *mbuf"
63 .Fn M_TRAILINGSPACE "struct mbuf *mbuf"
64 .Fn M_MOVE_PKTHDR "struct mbuf *to" "struct mbuf *from"
65 .Fn M_PREPEND "struct mbuf *mbuf" "int len" "int how"
66 .Fn MCHTYPE "struct mbuf *mbuf" "short type"
68 .Fn M_WRITABLE "struct mbuf *mbuf"
70 .Ss Mbuf allocation functions
72 .Fn m_get "int how" "short type"
74 .Fn m_get2 "int size" "int how" "short type" "int flags"
76 .Fn m_getm "struct mbuf *orig" "int len" "int how" "short type"
78 .Fn m_getjcl "int how" "short type" "int flags" "int size"
80 .Fn m_getcl "int how" "short type" "int flags"
82 .Fn m_getclr "int how" "short type"
84 .Fn m_gethdr "int how" "short type"
86 .Fn m_free "struct mbuf *mbuf"
88 .Fn m_freem "struct mbuf *mbuf"
90 .Ss Mbuf utility functions
92 .Fn m_adj "struct mbuf *mbuf" "int len"
94 .Fn m_align "struct mbuf *mbuf" "int len"
96 .Fn m_append "struct mbuf *mbuf" "int len" "c_caddr_t cp"
98 .Fn m_prepend "struct mbuf *mbuf" "int len" "int how"
100 .Fn m_copyup "struct mbuf *mbuf" "int len" "int dstoff"
102 .Fn m_pullup "struct mbuf *mbuf" "int len"
104 .Fn m_pulldown "struct mbuf *mbuf" "int offset" "int len" "int *offsetp"
106 .Fn m_copym "struct mbuf *mbuf" "int offset" "int len" "int how"
108 .Fn m_copypacket "struct mbuf *mbuf" "int how"
110 .Fn m_dup "struct mbuf *mbuf" "int how"
112 .Fn m_copydata "const struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
114 .Fn m_copyback "struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
120 .Fa "struct ifnet *ifp"
121 .Fa "void (*copy)(char *from, caddr_t to, u_int len)"
124 .Fn m_cat "struct mbuf *m" "struct mbuf *n"
126 .Fn m_catpkt "struct mbuf *m" "struct mbuf *n"
128 .Fn m_fixhdr "struct mbuf *mbuf"
130 .Fn m_dup_pkthdr "struct mbuf *to" "struct mbuf *from" "int how"
132 .Fn m_move_pkthdr "struct mbuf *to" "struct mbuf *from"
134 .Fn m_length "struct mbuf *mbuf" "struct mbuf **last"
136 .Fn m_split "struct mbuf *mbuf" "int len" "int how"
138 .Fn m_apply "struct mbuf *mbuf" "int off" "int len" "int (*f)(void *arg, void *data, u_int len)" "void *arg"
140 .Fn m_getptr "struct mbuf *mbuf" "int loc" "int *off"
142 .Fn m_defrag "struct mbuf *m0" "int how"
144 .Fn m_collapse "struct mbuf *m0" "int how" "int maxfrags"
146 .Fn m_unshare "struct mbuf *m0" "int how"
151 is a basic unit of memory management in the kernel IPC subsystem.
152 Network packets and socket buffers are stored in
154 A network packet may span multiple
159 which allows adding or trimming
160 network headers with little overhead.
162 While a developer should not bother with
164 internals without serious
165 reason in order to avoid incompatibilities with future changes, it
166 is useful to understand the general structure of an
171 consists of a variable-sized header and a small internal
176 is a constant defined in
181 .Bl -tag -width "m_nextpkt" -offset indent
184 A pointer to the next
190 A pointer to the next
195 A pointer to data attached to this
199 The length of the data.
202 The type of the data.
212 flag bits are defined as follows:
215 #define M_EXT 0x00000001 /* has associated external storage */
216 #define M_PKTHDR 0x00000002 /* start of record */
217 #define M_EOR 0x00000004 /* end of record */
218 #define M_RDONLY 0x00000008 /* associated data marked read-only */
219 #define M_PROTO1 0x00001000 /* protocol-specific */
220 #define M_PROTO2 0x00002000 /* protocol-specific */
221 #define M_PROTO3 0x00004000 /* protocol-specific */
222 #define M_PROTO4 0x00008000 /* protocol-specific */
223 #define M_PROTO5 0x00010000 /* protocol-specific */
224 #define M_PROTO6 0x00020000 /* protocol-specific */
225 #define M_PROTO7 0x00040000 /* protocol-specific */
226 #define M_PROTO8 0x00080000 /* protocol-specific */
227 #define M_PROTO9 0x00100000 /* protocol-specific */
228 #define M_PROTO10 0x00200000 /* protocol-specific */
229 #define M_PROTO11 0x00400000 /* protocol-specific */
230 #define M_PROTO12 0x00800000 /* protocol-specific */
232 /* mbuf pkthdr flags (also stored in m_flags) */
233 #define M_BCAST 0x00000010 /* sent/received as link-level broadcast */
234 #define M_MCAST 0x00000020 /* sent/received as link-level multicast */
239 types are defined as follows:
242 #define MT_DATA 1 /* dynamic (data) allocation */
243 #define MT_HEADER MT_DATA /* packet header */
244 #define MT_SONAME 8 /* socket name */
245 #define MT_CONTROL 14 /* extra-data protocol message */
246 #define MT_OOBDATA 15 /* expedited data */
249 The available external buffer types are defined as follows:
251 /* external buffer types */
252 #define EXT_CLUSTER 1 /* mbuf cluster */
253 #define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */
254 #define EXT_JUMBOP 3 /* jumbo cluster 4096 bytes */
255 #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */
256 #define EXT_JUMBO16 5 /* jumbo cluster 16384 bytes */
257 #define EXT_PACKET 6 /* mbuf+cluster from packet zone */
258 #define EXT_MBUF 7 /* external mbuf reference */
259 #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */
260 #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */
261 #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */
262 #define EXT_EXTREF 255 /* has externally maintained ref_cnt ptr */
268 .Vt struct pkthdr Va m_pkthdr
272 It contains a pointer to the interface
273 the packet has been received from
274 .Pq Vt struct ifnet Va *rcvif ,
275 and the total packet length
277 Optionally, it may also contain an attached list of packet tags
278 .Pq Vt "struct m_tag" .
282 Fields used in offloading checksum calculation to the hardware are kept in
286 .Sx HARDWARE-ASSISTED CHECKSUM CALCULATION
289 If small enough, data is stored in the internal data buffer of an
291 If the data is sufficiently large, another
295 or external storage may be associated with the
298 bytes of data can fit into an
306 If external storage is being associated with an
310 header is added at the cost of losing the internal data buffer.
311 It includes a pointer to external storage, the size of the storage,
312 a pointer to a function used for freeing the storage,
313 a pointer to an optional argument that can be passed to the function,
314 and a pointer to a reference counter.
317 using external storage has the
321 The system supplies a macro for allocating the desired external storage
325 The allocation and management of the reference counter is handled by the
328 The system also supplies a default type of external storage buffer called an
331 can be allocated and configured with the use of the
338 in size, where MCLBYTES is a machine-dependent constant.
339 The system defines an advisory macro
341 which is the smallest amount of data to put into an
346 It is typically preferable to store data into the data region of an
348 if size permits, as opposed to allocating a separate
350 to hold the same data.
352 .Ss Macros and Functions
353 There are numerous predefined macros and functions that provide the
354 developer with common utilities.
356 .Bl -ohang -offset indent
357 .It Fn mtod mbuf type
360 pointer to a data pointer.
361 The macro expands to the data pointer cast to the specified
364 It is advisable to ensure that there is enough contiguous data in
369 .It Fn MGET mbuf how type
372 and initialize it to contain internal data.
374 will point to the allocated
376 on success, or be set to
381 argument is to be set to
385 It specifies whether the caller is willing to block if necessary.
386 A number of other functions and macros related to
388 have the same argument because they may
389 at some point need to allocate new
391 .It Fn MGETHDR mbuf how type
394 and initialize it to contain a packet header
399 .It Fn MEXTADD mbuf buf size free opt_arg1 opt_arg2 flags type
400 Associate externally managed data with
402 Any internal data contained in the mbuf will be discarded, and the
409 arguments are the address and length, respectively, of the data.
412 argument points to a function which will be called to free the data
413 when the mbuf is freed; it is only used if
421 arguments will be passed unmodified to
425 argument specifies additional
427 flags; it is not necessary to specify
431 argument specifies the type of external data, which controls how it
432 will be disposed of when the
435 In most cases, the correct value is
437 .It Fn MCLGET mbuf how
438 Allocate and attach an
442 On success, a non-zero value is returned; otherwise, 0.
443 Historically, consumers would check for success by testing the
445 flag on the mbuf, but this is now discouraged to avoid unnecessary awareness
446 of the implementation of external storage in protocol stacks and device
448 .It Fn M_ALIGN mbuf len
451 to place an object of the size
453 at the end of the internal data area of
458 is newly allocated with
462 .It Fn MH_ALIGN mbuf len
463 Serves the same purpose as
475 .It Fn m_align mbuf len
476 Serves the same purpose as
478 but handles any type of mbuf.
479 .It Fn M_LEADINGSPACE mbuf
480 Returns the number of bytes available before the beginning
483 .It Fn M_TRAILINGSPACE mbuf
484 Returns the number of bytes available after the end of data in
486 .It Fn M_PREPEND mbuf len how
487 This macro operates on an
489 It is an optimized wrapper for
491 that can make use of possible empty space before data
492 (e.g.\& left after trimming of a link-layer header).
500 .It Fn M_MOVE_PKTHDR to from
501 Using this macro is equivalent to calling
502 .Fn m_move_pkthdr to from .
503 .It Fn M_WRITABLE mbuf
504 This macro will evaluate true if
510 does not contain external storage or,
512 then if the reference count of the storage is not greater than 1.
517 This can be achieved during setup of the external storage,
524 macro, or can be directly set in individual
526 .It Fn MCHTYPE mbuf type
531 This is a relatively expensive operation and should be avoided.
535 .Bl -ohang -offset indent
536 .It Fn m_get how type
537 A function version of
539 for non-critical paths.
540 .It Fn m_get2 size how type flags
543 with enough space to hold the specified amount of data.
544 .It Fn m_getm orig len how type
551 if necessary and append the resulting allocated
557 .No non- Ns Dv NULL .
558 If the allocation fails at any point,
559 free whatever was allocated and return
564 .No non- Ns Dv NULL ,
565 it will not be freed.
566 It is possible to use
574 (for example, one which may be sitting in a pre-allocated ring)
575 or to simply perform an all-or-nothing
580 .It Fn m_gethdr how type
581 A function version of
583 for non-critical paths.
584 .It Fn m_getcl how type flags
590 If one of the allocations fails, the entire allocation fails.
591 This routine is the preferred way of fetching both the
595 together, as it avoids having to unlock/relock between allocations.
599 .It Fn m_getjcl how type flags size
602 but the size of the cluster allocated will be large enough for
605 .It Fn m_getclr how type
608 and zero out the data region.
618 The functions below operate on
620 .Bl -ohang -offset indent
624 including any external storage.
626 .It Fn m_adj mbuf len
629 bytes from the head of an
633 is positive, from the tail otherwise.
635 .It Fn m_append mbuf len cp
642 Extend the mbuf chain if the new data does not fit in
645 .It Fn m_prepend mbuf len how
648 and prepend it to the
654 It does not allocate any
666 .It Fn m_copyup mbuf len dstoff
671 bytes of data into a new mbuf at
676 argument aligns the data and leaves room for a link layer header.
686 The function does not allocate
693 .It Fn m_pullup mbuf len
694 Arrange that the first
698 are contiguous and lie in the data area of
700 so they are accessible with
702 It is important to remember that this may involve
703 reallocating some mbufs and moving data so all pointers
704 referencing data within the old mbuf chain
705 must be recalculated or made invalid.
713 is freed in this case).
715 It does not allocate any
719 must be less than or equal to
722 .It Fn m_pulldown mbuf offset len offsetp
731 are contiguous and lie in the data area of
733 so they are accessible with
736 must be smaller than, or equal to, the size of an
738 Return a pointer to an intermediate
740 in the chain containing the requested region;
741 the offset in the data region of the
743 to the data contained in the returned mbuf is stored in
747 is NULL, the region may be accessed using
751 is non-NULL, the region may be accessed using
752 .Fn mtod mbuf uint8_t
754 The region of the mbuf chain between its beginning and
756 is not modified, therefore it is safe to hold pointers to data within
757 this region before calling
760 .It Fn m_copym mbuf offset len how
765 bytes from the beginning, continuing for
772 copy to the end of the
775 The copy is read-only, because the
777 are not copied, only their reference counts are incremented.
779 .It Fn m_copypacket mbuf how
780 Copy an entire packet including header, which must be present.
781 This is an optimized version of the common case
782 .Fn m_copym mbuf 0 M_COPYALL how .
784 the copy is read-only, because the
786 are not copied, only their reference counts are incremented.
788 .It Fn m_dup mbuf how
791 into a completely new
793 including copying any
797 when you need a writable copy of an
800 .It Fn m_copydata mbuf offset len buf
805 bytes from the beginning, continuing for
807 bytes, into the indicated buffer
810 .It Fn m_copyback mbuf offset len buf
813 bytes from the buffer
815 back into the indicated
819 bytes from the beginning of the
825 It does not allocate any
837 will be allocated to fill the space.
839 .It Fn m_length mbuf last
840 Return the length of the
842 and optionally a pointer to the last
845 .It Fn m_dup_pkthdr to from how
846 Upon the function's completion, the
849 will contain an identical copy of
851 and the per-packet attributes found in the
861 must be empty on entry.
863 .It Fn m_move_pkthdr to from
866 and the per-packet attributes from the
879 must be empty on entry.
880 Upon the function's completion,
884 and the per-packet attributes cleared.
887 Set the packet-header length to the length of the
890 .It Fn m_devget buf len offset ifp copy
891 Copy data from device-local memory pointed to by
895 The copy is done using a specified copy routine
911 must be of the same type.
913 is not guaranteed to be valid after
917 does not update any packet header fields or free mbuf tags.
922 that operates on packets.
927 must contain packet headers.
929 is not guaranteed to be valid after
933 .It Fn m_split mbuf len how
936 in two pieces, returning the tail:
940 In case of failure, it returns
942 and attempts to restore the
944 to its original state.
946 .It Fn m_apply mbuf off len f arg
947 Apply a function to an
954 Typically used to avoid calls to
956 which would otherwise be unnecessary or undesirable.
958 is a convenience argument which is passed to the callback function
963 is called, it will be passed
967 in the current mbuf, and the length
969 of the data in this mbuf to which the function should be applied.
971 The function should return zero to indicate success;
972 otherwise, if an error is indicated, then
974 will return the error and stop iterating through the
977 .It Fn m_getptr mbuf loc off
978 Return a pointer to the mbuf containing the data located at
980 bytes from the beginning of the
982 The corresponding offset into the mbuf will be stored in
984 .It Fn m_defrag m0 how
985 Defragment an mbuf chain, returning the shortest possible
986 chain of mbufs and clusters.
987 If allocation fails and this cannot be completed,
989 will be returned and the original chain will be unchanged.
990 Upon success, the original chain will be freed and the new
991 chain will be returned.
997 depending on the caller's preference.
999 This function is especially useful in network drivers, where
1000 certain long mbuf chains must be shortened before being added
1001 to TX descriptor lists.
1002 .It Fn m_collapse m0 how maxfrags
1003 Defragment an mbuf chain, returning a chain of at most
1006 If allocation fails or the chain cannot be collapsed as requested,
1008 will be returned, with the original chain possibly modified.
1016 .It Fn m_unshare m0 how
1017 Create a version of the specified mbuf chain whose
1018 contents can be safely modified without affecting other users.
1019 If allocation fails and this operation cannot be completed,
1022 The original mbuf chain is always reclaimed and the reference
1023 count of any shared mbuf clusters is decremented.
1029 depending on the caller's preference.
1030 As a side-effect of this process the returned
1031 mbuf chain may be compacted.
1033 This function is especially useful in the transmit path of
1034 network code, when data must be encrypted or otherwise
1035 altered prior to transmission.
1037 .Sh HARDWARE-ASSISTED CHECKSUM CALCULATION
1038 This section currently applies to TCP/IP only.
1039 In order to save the host CPU resources, computing checksums is
1040 offloaded to the network interface hardware if possible.
1043 member of the leading
1045 of a packet contains two fields used for that purpose,
1046 .Vt int Va csum_flags
1048 .Vt int Va csum_data .
1049 The meaning of those fields depends on the direction a packet flows in,
1050 and on whether the packet is fragmented.
1056 will denote the corresponding field of the
1058 member of the leading
1062 containing the packet.
1064 On output, checksum offloading is attempted after the outgoing
1065 interface has been determined for a packet.
1066 The interface-specific field
1067 .Va ifnet.if_data.ifi_hwassist
1070 is consulted for the capabilities of the interface to assist in
1071 computing checksums.
1074 field of the packet header is set to indicate which actions the interface
1075 is supposed to perform on it.
1076 The actions unsupported by the network interface are done in
1077 software prior to passing the packet down to the interface driver;
1078 such actions will never be requested through
1081 The flags demanding a particular action from an interface are as follows:
1082 .Bl -tag -width ".Dv CSUM_TCP" -offset indent
1084 The IP header checksum is to be computed and stored in the
1085 corresponding field of the packet.
1086 The hardware is expected to know the format of an IP header
1087 to determine the offset of the IP checksum field.
1089 The TCP checksum is to be computed.
1092 The UDP checksum is to be computed.
1096 Should a TCP or UDP checksum be offloaded to the hardware,
1099 will contain the byte offset of the checksum field relative to the
1100 end of the IP header.
1101 In this case, the checksum field will be initially
1102 set by the TCP/IP module to the checksum of the pseudo header
1103 defined by the TCP and UDP specifications.
1105 On input, an interface indicates the actions it has performed
1106 on a packet by setting one or more of the following flags in
1108 associated with the packet:
1109 .Bl -tag -width ".Dv CSUM_IP_CHECKED" -offset indent
1110 .It Dv CSUM_IP_CHECKED
1111 The IP header checksum has been computed.
1112 .It Dv CSUM_IP_VALID
1113 The IP header has a valid checksum.
1114 This flag can appear only in combination with
1115 .Dv CSUM_IP_CHECKED .
1116 .It Dv CSUM_DATA_VALID
1117 The checksum of the data portion of the IP packet has been computed
1118 and stored in the field
1120 in network byte order.
1121 .It Dv CSUM_PSEUDO_HDR
1122 Can be set only along with
1124 to indicate that the IP data checksum found in
1126 allows for the pseudo header defined by the TCP and UDP specifications.
1127 Otherwise the checksum of the pseudo header must be calculated by
1128 the host CPU and added to
1130 to obtain the final checksum to be used for TCP or UDP validation purposes.
1133 If a particular network interface just indicates success or
1134 failure of TCP or UDP checksum validation without returning
1135 the exact value of the checksum to the host CPU, its driver can mark
1145 hexadecimal to indicate a valid checksum.
1146 It is a peculiarity of the algorithm used that the Internet checksum
1147 calculated over any valid packet will be
1149 as long as the original checksum field is included.
1151 When running a kernel compiled with the option
1152 .Dv MBUF_STRESS_TEST ,
1155 -controlled options may be used to create
1156 various failure/extreme cases for testing of network drivers
1157 and other parts of the kernel that rely on
1159 .Bl -tag -width ident
1160 .It Va net.inet.ip.mbuf_frag_size
1163 to fragment outgoing
1165 into fragments of the specified size.
1166 Setting this variable to 1 is an excellent way to
1169 handling ability of network drivers.
1170 .It Va kern.ipc.m_defragrandomfailures
1173 to randomly fail, returning
1175 Any piece of code which uses
1177 should be tested with this feature.
1185 .\" Please correct me if I'm wrong
1187 appeared in an early version of
1189 Besides being used for network packets, they were used
1190 to store various dynamic structures, such as routing table
1191 entries, interface addresses, protocol control blocks, etc.
1196 is almost entirely limited to packet storage, with
1198 zones being used directly to store other network-related memory.
1202 allocator has been a special-purpose memory allocator able to run in
1203 interrupt contexts and allocating from a special kernel address space map.
1208 allocator is a wrapper around
1214 + cluster pairs in per-CPU caches, as well as bringing other benefits of
1219 manual page was written by
1224 allocator was written by