2 * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
3 * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36 #endif /* HAVE_CONFIG_H */
47 * Most distro's headers don't have these yet.
51 #define MADV_DONTFORK 10
55 #define MADV_DOFORK 11
58 #define MADV_DONTFORK INHERIT_NONE
59 #define MADV_DOFORK INHERIT_SHARE
67 struct ibv_mem_node *parent;
68 struct ibv_mem_node *left, *right;
73 static struct ibv_mem_node *mm_root;
74 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
78 int ibv_fork_init(void)
91 page_size = sysconf(_SC_PAGESIZE);
96 if (posix_memalign(&tmp, page_size, page_size))
99 ret = madvise(tmp, page_size, MADV_DONTFORK) ||
100 madvise(tmp, page_size, MADV_DOFORK);
108 mm_root = malloc(sizeof *mm_root);
112 mm_root->parent = NULL;
113 mm_root->left = NULL;
114 mm_root->right = NULL;
115 mm_root->color = IBV_BLACK;
117 mm_root->end = UINTPTR_MAX;
123 static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
130 while (node->parent && node == node->parent->left)
139 static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
146 while (node->parent && node == node->parent->right)
155 static void __mm_rotate_right(struct ibv_mem_node *node)
157 struct ibv_mem_node *tmp;
161 node->left = tmp->right;
163 node->left->parent = node;
166 if (node->parent->right == node)
167 node->parent->right = tmp;
169 node->parent->left = tmp;
173 tmp->parent = node->parent;
179 static void __mm_rotate_left(struct ibv_mem_node *node)
181 struct ibv_mem_node *tmp;
185 node->right = tmp->left;
187 node->right->parent = node;
190 if (node->parent->right == node)
191 node->parent->right = tmp;
193 node->parent->left = tmp;
197 tmp->parent = node->parent;
203 static int verify(struct ibv_mem_node *node)
210 hl = verify(node->left);
211 hr = verify(node->left);
218 if (node->color == IBV_RED) {
219 if (node->left && node->left->color != IBV_BLACK)
221 if (node->right && node->right->color != IBV_BLACK)
229 static void __mm_add_rebalance(struct ibv_mem_node *node)
231 struct ibv_mem_node *parent, *gp, *uncle;
233 while (node->parent && node->parent->color == IBV_RED) {
234 parent = node->parent;
235 gp = node->parent->parent;
237 if (parent == gp->left) {
240 if (uncle && uncle->color == IBV_RED) {
241 parent->color = IBV_BLACK;
242 uncle->color = IBV_BLACK;
247 if (node == parent->right) {
248 __mm_rotate_left(parent);
250 parent = node->parent;
253 parent->color = IBV_BLACK;
256 __mm_rotate_right(gp);
261 if (uncle && uncle->color == IBV_RED) {
262 parent->color = IBV_BLACK;
263 uncle->color = IBV_BLACK;
268 if (node == parent->left) {
269 __mm_rotate_right(parent);
271 parent = node->parent;
274 parent->color = IBV_BLACK;
277 __mm_rotate_left(gp);
282 mm_root->color = IBV_BLACK;
285 static void __mm_add(struct ibv_mem_node *new)
287 struct ibv_mem_node *node, *parent = NULL;
292 if (node->start < new->start)
298 if (parent->start < new->start)
303 new->parent = parent;
307 new->color = IBV_RED;
308 __mm_add_rebalance(new);
311 static void __mm_remove(struct ibv_mem_node *node)
313 struct ibv_mem_node *child, *parent, *sib, *tmp;
316 if (node->left && node->right) {
321 nodecol = tmp->color;
323 tmp->color = node->color;
325 if (tmp->parent != node) {
326 parent = tmp->parent;
327 parent->right = tmp->left;
329 tmp->left->parent = parent;
331 tmp->left = node->left;
332 node->left->parent = tmp;
336 tmp->right = node->right;
337 node->right->parent = tmp;
339 tmp->parent = node->parent;
341 if (node->parent->left == node)
342 node->parent->left = tmp;
344 node->parent->right = tmp;
348 nodecol = node->color;
350 child = node->left ? node->left : node->right;
351 parent = node->parent;
354 child->parent = parent;
356 if (parent->left == node)
357 parent->left = child;
359 parent->right = child;
366 if (nodecol == IBV_RED)
369 while ((!child || child->color == IBV_BLACK) && child != mm_root) {
370 if (parent->left == child) {
373 if (sib->color == IBV_RED) {
374 parent->color = IBV_RED;
375 sib->color = IBV_BLACK;
376 __mm_rotate_left(parent);
380 if ((!sib->left || sib->left->color == IBV_BLACK) &&
381 (!sib->right || sib->right->color == IBV_BLACK)) {
382 sib->color = IBV_RED;
384 parent = child->parent;
386 if (!sib->right || sib->right->color == IBV_BLACK) {
388 sib->left->color = IBV_BLACK;
389 sib->color = IBV_RED;
390 __mm_rotate_right(sib);
394 sib->color = parent->color;
395 parent->color = IBV_BLACK;
397 sib->right->color = IBV_BLACK;
398 __mm_rotate_left(parent);
405 if (sib->color == IBV_RED) {
406 parent->color = IBV_RED;
407 sib->color = IBV_BLACK;
408 __mm_rotate_right(parent);
412 if ((!sib->left || sib->left->color == IBV_BLACK) &&
413 (!sib->right || sib->right->color == IBV_BLACK)) {
414 sib->color = IBV_RED;
416 parent = child->parent;
418 if (!sib->left || sib->left->color == IBV_BLACK) {
420 sib->right->color = IBV_BLACK;
421 sib->color = IBV_RED;
422 __mm_rotate_left(sib);
426 sib->color = parent->color;
427 parent->color = IBV_BLACK;
429 sib->left->color = IBV_BLACK;
430 __mm_rotate_right(parent);
438 child->color = IBV_BLACK;
441 static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
443 struct ibv_mem_node *node = mm_root;
446 if (node->start <= start && node->end >= start)
449 if (node->start < start)
458 static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node,
459 struct ibv_mem_node *prev)
461 prev->end = node->end;
462 prev->refcnt = node->refcnt;
468 static struct ibv_mem_node *split_range(struct ibv_mem_node *node,
471 struct ibv_mem_node *new_node = NULL;
473 new_node = malloc(sizeof *new_node);
476 new_node->start = cut_line;
477 new_node->end = node->end;
478 new_node->refcnt = node->refcnt;
479 node->end = cut_line - 1;
485 static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end,
488 struct ibv_mem_node *node, *tmp = NULL;
490 node = __mm_find_start(start, end);
491 if (node->start < start)
492 node = split_range(node, start);
494 tmp = __mm_prev(node);
495 if (tmp && tmp->refcnt == node->refcnt + inc)
496 node = merge_ranges(node, tmp);
502 * This function is called if madvise() fails to undo merging/splitting
503 * operations performed on the node.
505 static struct ibv_mem_node *undo_node(struct ibv_mem_node *node,
506 uintptr_t start, int inc)
508 struct ibv_mem_node *tmp = NULL;
511 * This condition can be true only if we merged this
512 * node with the previous one, so we need to split them.
514 if (start > node->start) {
515 tmp = split_range(node, start);
523 tmp = __mm_prev(node);
524 if (tmp && tmp->refcnt == node->refcnt)
525 node = merge_ranges(node, tmp);
527 tmp = __mm_next(node);
528 if (tmp && tmp->refcnt == node->refcnt)
529 node = merge_ranges(tmp, node);
534 static int ibv_madvise_range(void *base, size_t size, int advice)
536 uintptr_t start, end;
537 struct ibv_mem_node *node, *tmp;
539 int rolling_back = 0;
545 start = (uintptr_t) base & ~(page_size - 1);
546 end = ((uintptr_t) (base + size + page_size - 1) &
547 ~(page_size - 1)) - 1;
549 pthread_mutex_lock(&mm_mutex);
551 inc = advice == MADV_DONTFORK ? 1 : -1;
553 node = get_start_node(start, end, inc);
559 while (node && node->start <= end) {
560 if (node->end > end) {
561 if (!split_range(node, end + 1)) {
567 if ((inc == -1 && node->refcnt == 1) ||
568 (inc == 1 && node->refcnt == 0)) {
570 * If this is the first time through the loop,
571 * and we merged this node with the previous
572 * one, then we only want to do the madvise()
573 * on start ... node->end (rather than
574 * starting at node->start).
576 * Otherwise we end up doing madvise() on
577 * bigger region than we're being asked to,
578 * and that may lead to a spurious failure.
580 if (start > node->start)
581 ret = minherit((void *) start, node->end - start + 1,
584 ret = minherit((void *) node->start,
585 node->end - node->start + 1,
588 node = undo_node(node, start, inc);
590 if (rolling_back || !node)
593 /* madvise failed, roll back previous changes */
595 advice = advice == MADV_DONTFORK ?
596 MADV_DOFORK : MADV_DONTFORK;
597 tmp = __mm_prev(node);
598 if (!tmp || start > tmp->end)
606 node = __mm_next(node);
610 tmp = __mm_prev(node);
611 if (tmp && node->refcnt == tmp->refcnt)
612 node = merge_ranges(node, tmp);
619 pthread_mutex_unlock(&mm_mutex);
624 int ibv_dontfork_range(void *base, size_t size)
627 return ibv_madvise_range(base, size, MADV_DONTFORK);
634 int ibv_dofork_range(void *base, size_t size)
637 return ibv_madvise_range(base, size, MADV_DOFORK);