]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/ofed/libibverbs/memory.c
Merge clang trunk r351319, resolve conflicts, and update FREEBSD-Xlist.
[FreeBSD/FreeBSD.git] / contrib / ofed / libibverbs / memory.c
1 /*
2  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <config.h>
35
36 #include <errno.h>
37 #include <sys/mman.h>
38 #include <unistd.h>
39 #include <stdlib.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <dirent.h>
44 #include <limits.h>
45 #include <inttypes.h>
46
47 #include "ibverbs.h"
48
49 struct ibv_mem_node {
50         enum {
51                 IBV_RED,
52                 IBV_BLACK
53         }                       color;
54         struct ibv_mem_node    *parent;
55         struct ibv_mem_node    *left, *right;
56         uintptr_t               start, end;
57         int                     refcnt;
58 };
59
60 static struct ibv_mem_node *mm_root;
61 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
62 static int page_size;
63 static int huge_page_enabled;
64 static int too_late;
65
66 static unsigned long smaps_page_size(FILE *file)
67 {
68         int n;
69         unsigned long size = page_size;
70         char buf[1024];
71
72         while (fgets(buf, sizeof(buf), file) != NULL) {
73                 if (!strstr(buf, "KernelPageSize:"))
74                         continue;
75
76                 n = sscanf(buf, "%*s %lu", &size);
77                 if (n < 1)
78                         continue;
79
80                 /* page size is printed in Kb */
81                 size = size * 1024;
82
83                 break;
84         }
85
86         return size;
87 }
88
89 static unsigned long get_page_size(void *base)
90 {
91         unsigned long ret = page_size;
92         pid_t pid;
93         FILE *file;
94         char buf[1024];
95
96         pid = getpid();
97         snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid);
98
99         file = fopen(buf, "r" STREAM_CLOEXEC);
100         if (!file)
101                 goto out;
102
103         while (fgets(buf, sizeof(buf), file) != NULL) {
104                 int n;
105                 uintptr_t range_start, range_end;
106
107                 n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end);
108
109                 if (n < 2)
110                         continue;
111
112                 if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
113                         ret = smaps_page_size(file);
114                         break;
115                 }
116         }
117
118         fclose(file);
119
120 out:
121         return ret;
122 }
123
124 int ibv_fork_init(void)
125 {
126         void *tmp, *tmp_aligned;
127         int ret;
128         unsigned long size;
129
130         if (getenv("RDMAV_HUGEPAGES_SAFE"))
131                 huge_page_enabled = 1;
132
133         if (mm_root)
134                 return 0;
135
136         if (too_late)
137                 return EINVAL;
138
139         page_size = sysconf(_SC_PAGESIZE);
140         if (page_size < 0)
141                 return errno;
142
143         if (posix_memalign(&tmp, page_size, page_size))
144                 return ENOMEM;
145
146         if (huge_page_enabled) {
147                 size = get_page_size(tmp);
148                 tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1));
149         } else {
150                 size = page_size;
151                 tmp_aligned = tmp;
152         }
153
154         ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
155               madvise(tmp_aligned, size, MADV_DOFORK);
156
157         free(tmp);
158
159         if (ret)
160                 return ENOSYS;
161
162         mm_root = malloc(sizeof *mm_root);
163         if (!mm_root)
164                 return ENOMEM;
165
166         mm_root->parent = NULL;
167         mm_root->left   = NULL;
168         mm_root->right  = NULL;
169         mm_root->color  = IBV_BLACK;
170         mm_root->start  = 0;
171         mm_root->end    = UINTPTR_MAX;
172         mm_root->refcnt = 0;
173
174         return 0;
175 }
176
177 static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
178 {
179         if (node->left) {
180                 node = node->left;
181                 while (node->right)
182                         node = node->right;
183         } else {
184                 while (node->parent && node == node->parent->left)
185                         node = node->parent;
186
187                 node = node->parent;
188         }
189
190         return node;
191 }
192
193 static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
194 {
195         if (node->right) {
196                 node = node->right;
197                 while (node->left)
198                         node = node->left;
199         } else {
200                 while (node->parent && node == node->parent->right)
201                         node = node->parent;
202
203                 node = node->parent;
204         }
205
206         return node;
207 }
208
209 static void __mm_rotate_right(struct ibv_mem_node *node)
210 {
211         struct ibv_mem_node *tmp;
212
213         tmp = node->left;
214
215         node->left = tmp->right;
216         if (node->left)
217                 node->left->parent = node;
218
219         if (node->parent) {
220                 if (node->parent->right == node)
221                         node->parent->right = tmp;
222                 else
223                         node->parent->left = tmp;
224         } else
225                 mm_root = tmp;
226
227         tmp->parent = node->parent;
228
229         tmp->right = node;
230         node->parent = tmp;
231 }
232
233 static void __mm_rotate_left(struct ibv_mem_node *node)
234 {
235         struct ibv_mem_node *tmp;
236
237         tmp = node->right;
238
239         node->right = tmp->left;
240         if (node->right)
241                 node->right->parent = node;
242
243         if (node->parent) {
244                 if (node->parent->right == node)
245                         node->parent->right = tmp;
246                 else
247                         node->parent->left = tmp;
248         } else
249                 mm_root = tmp;
250
251         tmp->parent = node->parent;
252
253         tmp->left = node;
254         node->parent = tmp;
255 }
256
257 #if 0
258 static int verify(struct ibv_mem_node *node)
259 {
260         int hl, hr;
261
262         if (!node)
263                 return 1;
264
265         hl = verify(node->left);
266         hr = verify(node->left);
267
268         if (!hl || !hr)
269                 return 0;
270         if (hl != hr)
271                 return 0;
272
273         if (node->color == IBV_RED) {
274                 if (node->left && node->left->color != IBV_BLACK)
275                         return 0;
276                 if (node->right && node->right->color != IBV_BLACK)
277                         return 0;
278                 return hl;
279         }
280
281         return hl + 1;
282 }
283 #endif
284
285 static void __mm_add_rebalance(struct ibv_mem_node *node)
286 {
287         struct ibv_mem_node *parent, *gp, *uncle;
288
289         while (node->parent && node->parent->color == IBV_RED) {
290                 parent = node->parent;
291                 gp     = node->parent->parent;
292
293                 if (parent == gp->left) {
294                         uncle = gp->right;
295
296                         if (uncle && uncle->color == IBV_RED) {
297                                 parent->color = IBV_BLACK;
298                                 uncle->color  = IBV_BLACK;
299                                 gp->color     = IBV_RED;
300
301                                 node = gp;
302                         } else {
303                                 if (node == parent->right) {
304                                         __mm_rotate_left(parent);
305                                         node   = parent;
306                                         parent = node->parent;
307                                 }
308
309                                 parent->color = IBV_BLACK;
310                                 gp->color     = IBV_RED;
311
312                                 __mm_rotate_right(gp);
313                         }
314                 } else {
315                         uncle = gp->left;
316
317                         if (uncle && uncle->color == IBV_RED) {
318                                 parent->color = IBV_BLACK;
319                                 uncle->color  = IBV_BLACK;
320                                 gp->color     = IBV_RED;
321
322                                 node = gp;
323                         } else {
324                                 if (node == parent->left) {
325                                         __mm_rotate_right(parent);
326                                         node   = parent;
327                                         parent = node->parent;
328                                 }
329
330                                 parent->color = IBV_BLACK;
331                                 gp->color     = IBV_RED;
332
333                                 __mm_rotate_left(gp);
334                         }
335                 }
336         }
337
338         mm_root->color = IBV_BLACK;
339 }
340
341 static void __mm_add(struct ibv_mem_node *new)
342 {
343         struct ibv_mem_node *node, *parent = NULL;
344
345         node = mm_root;
346         while (node) {
347                 parent = node;
348                 if (node->start < new->start)
349                         node = node->right;
350                 else
351                         node = node->left;
352         }
353
354         if (parent->start < new->start)
355                 parent->right = new;
356         else
357                 parent->left = new;
358
359         new->parent = parent;
360         new->left   = NULL;
361         new->right  = NULL;
362
363         new->color = IBV_RED;
364         __mm_add_rebalance(new);
365 }
366
367 static void __mm_remove(struct ibv_mem_node *node)
368 {
369         struct ibv_mem_node *child, *parent, *sib, *tmp;
370         int nodecol;
371
372         if (node->left && node->right) {
373                 tmp = node->left;
374                 while (tmp->right)
375                         tmp = tmp->right;
376
377                 nodecol    = tmp->color;
378                 child      = tmp->left;
379                 tmp->color = node->color;
380
381                 if (tmp->parent != node) {
382                         parent        = tmp->parent;
383                         parent->right = tmp->left;
384                         if (tmp->left)
385                                 tmp->left->parent = parent;
386
387                         tmp->left          = node->left;
388                         node->left->parent = tmp;
389                 } else
390                         parent = tmp;
391
392                 tmp->right          = node->right;
393                 node->right->parent = tmp;
394
395                 tmp->parent = node->parent;
396                 if (node->parent) {
397                         if (node->parent->left == node)
398                                 node->parent->left = tmp;
399                         else
400                                 node->parent->right = tmp;
401                 } else
402                         mm_root = tmp;
403         } else {
404                 nodecol = node->color;
405
406                 child  = node->left ? node->left : node->right;
407                 parent = node->parent;
408
409                 if (child)
410                         child->parent = parent;
411                 if (parent) {
412                         if (parent->left == node)
413                                 parent->left = child;
414                         else
415                                 parent->right = child;
416                 } else
417                         mm_root = child;
418         }
419
420         free(node);
421
422         if (nodecol == IBV_RED)
423                 return;
424
425         while ((!child || child->color == IBV_BLACK) && child != mm_root) {
426                 if (parent->left == child) {
427                         sib = parent->right;
428
429                         if (sib->color == IBV_RED) {
430                                 parent->color = IBV_RED;
431                                 sib->color    = IBV_BLACK;
432                                 __mm_rotate_left(parent);
433                                 sib = parent->right;
434                         }
435
436                         if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
437                             (!sib->right || sib->right->color == IBV_BLACK)) {
438                                 sib->color = IBV_RED;
439                                 child  = parent;
440                                 parent = child->parent;
441                         } else {
442                                 if (!sib->right || sib->right->color == IBV_BLACK) {
443                                         if (sib->left)
444                                                 sib->left->color = IBV_BLACK;
445                                         sib->color = IBV_RED;
446                                         __mm_rotate_right(sib);
447                                         sib = parent->right;
448                                 }
449
450                                 sib->color    = parent->color;
451                                 parent->color = IBV_BLACK;
452                                 if (sib->right)
453                                         sib->right->color = IBV_BLACK;
454                                 __mm_rotate_left(parent);
455                                 child = mm_root;
456                                 break;
457                         }
458                 } else {
459                         sib = parent->left;
460
461                         if (sib->color == IBV_RED) {
462                                 parent->color = IBV_RED;
463                                 sib->color    = IBV_BLACK;
464                                 __mm_rotate_right(parent);
465                                 sib = parent->left;
466                         }
467
468                         if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
469                             (!sib->right || sib->right->color == IBV_BLACK)) {
470                                 sib->color = IBV_RED;
471                                 child  = parent;
472                                 parent = child->parent;
473                         } else {
474                                 if (!sib->left || sib->left->color == IBV_BLACK) {
475                                         if (sib->right)
476                                                 sib->right->color = IBV_BLACK;
477                                         sib->color = IBV_RED;
478                                         __mm_rotate_left(sib);
479                                         sib = parent->left;
480                                 }
481
482                                 sib->color    = parent->color;
483                                 parent->color = IBV_BLACK;
484                                 if (sib->left)
485                                         sib->left->color = IBV_BLACK;
486                                 __mm_rotate_right(parent);
487                                 child = mm_root;
488                                 break;
489                         }
490                 }
491         }
492
493         if (child)
494                 child->color = IBV_BLACK;
495 }
496
497 static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
498 {
499         struct ibv_mem_node *node = mm_root;
500
501         while (node) {
502                 if (node->start <= start && node->end >= start)
503                         break;
504
505                 if (node->start < start)
506                         node = node->right;
507                 else
508                         node = node->left;
509         }
510
511         return node;
512 }
513
514 static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node,
515                                          struct ibv_mem_node *prev)
516 {
517         prev->end = node->end;
518         prev->refcnt = node->refcnt;
519         __mm_remove(node);
520
521         return prev;
522 }
523
524 static struct ibv_mem_node *split_range(struct ibv_mem_node *node,
525                                         uintptr_t cut_line)
526 {
527         struct ibv_mem_node *new_node = NULL;
528
529         new_node = malloc(sizeof *new_node);
530         if (!new_node)
531                 return NULL;
532         new_node->start  = cut_line;
533         new_node->end    = node->end;
534         new_node->refcnt = node->refcnt;
535         node->end  = cut_line - 1;
536         __mm_add(new_node);
537
538         return new_node;
539 }
540
541 static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end,
542                                            int inc)
543 {
544         struct ibv_mem_node *node, *tmp = NULL;
545
546         node = __mm_find_start(start, end);
547         if (node->start < start)
548                 node = split_range(node, start);
549         else {
550                 tmp = __mm_prev(node);
551                 if (tmp && tmp->refcnt == node->refcnt + inc)
552                         node = merge_ranges(node, tmp);
553         }
554         return node;
555 }
556
557 /*
558  * This function is called if madvise() fails to undo merging/splitting
559  * operations performed on the node.
560  */
561 static struct ibv_mem_node *undo_node(struct ibv_mem_node *node,
562                                       uintptr_t start, int inc)
563 {
564         struct ibv_mem_node *tmp = NULL;
565
566         /*
567          * This condition can be true only if we merged this
568          * node with the previous one, so we need to split them.
569         */
570         if (start > node->start) {
571                 tmp = split_range(node, start);
572                 if (tmp) {
573                         node->refcnt += inc;
574                         node = tmp;
575                 } else
576                         return NULL;
577         }
578
579         tmp  =  __mm_prev(node);
580         if (tmp && tmp->refcnt == node->refcnt)
581                 node = merge_ranges(node, tmp);
582
583         tmp  =  __mm_next(node);
584         if (tmp && tmp->refcnt == node->refcnt)
585                 node = merge_ranges(tmp, node);
586
587         return node;
588 }
589
590 static int ibv_madvise_range(void *base, size_t size, int advice)
591 {
592         uintptr_t start, end;
593         struct ibv_mem_node *node, *tmp;
594         int inc;
595         int rolling_back = 0;
596         int ret = 0;
597         unsigned long range_page_size;
598
599         if (!size)
600                 return 0;
601
602         if (huge_page_enabled)
603                 range_page_size = get_page_size(base);
604         else
605                 range_page_size = page_size;
606
607         start = (uintptr_t) base & ~(range_page_size - 1);
608         end   = ((uintptr_t) (base + size + range_page_size - 1) &
609                  ~(range_page_size - 1)) - 1;
610
611         pthread_mutex_lock(&mm_mutex);
612 again:
613         inc = advice == MADV_DONTFORK ? 1 : -1;
614
615         node = get_start_node(start, end, inc);
616         if (!node) {
617                 ret = -1;
618                 goto out;
619         }
620
621         while (node && node->start <= end) {
622                 if (node->end > end) {
623                         if (!split_range(node, end + 1)) {
624                                 ret = -1;
625                                 goto out;
626                         }
627                 }
628
629                 if ((inc == -1 && node->refcnt == 1) ||
630                     (inc ==  1 && node->refcnt == 0)) {
631                         /*
632                          * If this is the first time through the loop,
633                          * and we merged this node with the previous
634                          * one, then we only want to do the madvise()
635                          * on start ... node->end (rather than
636                          * starting at node->start).
637                          *
638                          * Otherwise we end up doing madvise() on
639                          * bigger region than we're being asked to,
640                          * and that may lead to a spurious failure.
641                          */
642                         if (start > node->start)
643                                 ret = madvise((void *) start, node->end - start + 1,
644                                               advice);
645                         else
646                                 ret = madvise((void *) node->start,
647                                               node->end - node->start + 1,
648                                               advice);
649                         if (ret) {
650                                 node = undo_node(node, start, inc);
651
652                                 if (rolling_back || !node)
653                                         goto out;
654
655                                 /* madvise failed, roll back previous changes */
656                                 rolling_back = 1;
657                                 advice = advice == MADV_DONTFORK ?
658                                         MADV_DOFORK : MADV_DONTFORK;
659                                 tmp = __mm_prev(node);
660                                 if (!tmp || start > tmp->end)
661                                         goto out;
662                                 end = tmp->end;
663                                 goto again;
664                         }
665                 }
666
667                 node->refcnt += inc;
668                 node = __mm_next(node);
669         }
670
671         if (node) {
672                 tmp = __mm_prev(node);
673                 if (tmp && node->refcnt == tmp->refcnt)
674                         node = merge_ranges(node, tmp);
675         }
676
677 out:
678         if (rolling_back)
679                 ret = -1;
680
681         pthread_mutex_unlock(&mm_mutex);
682
683         return ret;
684 }
685
686 int ibv_dontfork_range(void *base, size_t size)
687 {
688         if (mm_root)
689                 return ibv_madvise_range(base, size, MADV_DONTFORK);
690         else {
691                 too_late = 1;
692                 return 0;
693         }
694 }
695
696 int ibv_dofork_range(void *base, size_t size)
697 {
698         if (mm_root)
699                 return ibv_madvise_range(base, size, MADV_DOFORK);
700         else {
701                 too_late = 1;
702                 return 0;
703         }
704 }