]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/ofed/opensm/opensm/osm_ucast_dfsssp.c
Merge OpenSSL 1.1.1f.
[FreeBSD/FreeBSD.git] / contrib / ofed / opensm / opensm / osm_ucast_dfsssp.c
1 /*
2  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2009-2015 ZIH, TU Dresden, Federal Republic of Germany. All rights reserved.
6  * Copyright (C) 2012-2017 Tokyo Institute of Technology. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  */
37
38 /*
39  * Abstract:
40  *    Implementation of OpenSM (deadlock-free) single-source-shortest-path routing
41  *    (with dijkstra algorithm)
42  */
43
44 #if HAVE_CONFIG_H
45 #  include <config.h>
46 #endif                          /* HAVE_CONFIG_H */
47
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <opensm/osm_file_ids.h>
52 #define FILE_ID OSM_FILE_UCAST_DFSSSP_C
53 #include <opensm/osm_ucast_mgr.h>
54 #include <opensm/osm_opensm.h>
55 #include <opensm/osm_node.h>
56 #include <opensm/osm_multicast.h>
57 #include <opensm/osm_mcast_mgr.h>
58
59 /* "infinity" for dijkstra */
60 #define INF      0x7FFFFFFF
61
62 enum {
63         UNDISCOVERED = 0,
64         DISCOVERED
65 };
66
67 enum {
68         UNKNOWN = 0,
69         GRAY,
70         BLACK,
71 };
72
73 typedef struct link {
74         uint64_t guid;          /* guid of the neighbor behind the link */
75         uint32_t from;          /* base_index in the adjazenz list (start of the link) */
76         uint8_t from_port;      /* port on the base_side (needed for weight update to identify the correct link for multigraphs) */
77         uint32_t to;            /* index of the neighbor in the adjazenz list (end of the link) */
78         uint8_t to_port;        /* port on the side of the neighbor (needed for the LFT) */
79         uint64_t weight;        /* link weight */
80         struct link *next;
81 } link_t;
82
83 typedef struct vertex {
84         /* informations of the fabric */
85         uint64_t guid;
86         uint16_t lid;           /* for lft filling */
87         uint32_t num_hca;       /* numbers of Hca/LIDs on the switch, for weight calculation */
88         link_t *links;
89         uint8_t hops;
90         /* for dijkstra routing */
91         link_t *used_link;      /* link between the vertex discovered before and this vertex */
92         uint64_t distance;      /* distance from source to this vertex */
93         uint8_t state;
94         /* for the binary heap */
95         uint32_t heap_id;
96         /* for LFT writing and debug */
97         osm_switch_t *sw;       /* selfpointer */
98         boolean_t dropped;      /* indicate dropped switches (w/ ucast cache) */
99 } vertex_t;
100
101 typedef struct binary_heap {
102         uint32_t size;          /* size of the heap */
103         vertex_t **nodes;       /* array with pointers to elements of the adj_list */
104 } binary_heap_t;
105
106 typedef struct vltable {
107         uint64_t num_lids;      /* size of the lids array */
108         uint16_t *lids;         /* sorted array of all lids in the subnet */
109         uint8_t *vls;           /* matrix form assignment lid X lid -> virtual lane */
110 } vltable_t;
111
112 typedef struct cdg_link {
113         struct cdg_node *node;
114         uint32_t num_pairs;     /* number of src->dest pairs incremented in path adding step */
115         uint32_t max_len;       /* length of the srcdest array */
116         uint32_t removed;       /* number of pairs removed in path deletion step */
117         uint32_t *srcdest_pairs;
118         struct cdg_link *next;
119 } cdg_link_t;
120
121 /* struct for a node of a binary tree with additional parent pointer */
122 typedef struct cdg_node {
123         uint64_t channelID;     /* unique key consist of src lid + port + dest lid + port */
124         cdg_link_t *linklist;   /* edges to adjazent nodes */
125         uint8_t status;         /* node status in cycle search to avoid recursive function */
126         uint8_t visited;        /* needed to traverse the binary tree */
127         struct cdg_node *pre;   /* to save the path in cycle detection algorithm */
128         struct cdg_node *left, *right, *parent;
129 } cdg_node_t;
130
131 typedef struct dfsssp_context {
132         osm_routing_engine_type_t routing_type;
133         osm_ucast_mgr_t *p_mgr;
134         vertex_t *adj_list;
135         uint32_t adj_list_size;
136         vltable_t *srcdest2vl_table;
137         uint8_t *vl_split_count;
138 } dfsssp_context_t;
139
140 /**************** set initial values for structs **********************
141  **********************************************************************/
142 static inline void set_default_link(link_t * link)
143 {
144         link->guid = 0;
145         link->from = 0;
146         link->from_port = 0;
147         link->to = 0;
148         link->to_port = 0;
149         link->weight = 0;
150         link->next = NULL;
151 }
152
153 static inline void set_default_vertex(vertex_t * vertex)
154 {
155         vertex->guid = 0;
156         vertex->lid = 0;
157         vertex->num_hca = 0;
158         vertex->links = NULL;
159         vertex->hops = 0;
160         vertex->used_link = NULL;
161         vertex->distance = 0;
162         vertex->state = UNDISCOVERED;
163         vertex->heap_id = 0;
164         vertex->sw = NULL;
165         vertex->dropped = FALSE;
166 }
167
168 static inline void set_default_cdg_node(cdg_node_t * node)
169 {
170         node->channelID = 0;
171         node->linklist = NULL;
172         node->status = UNKNOWN;
173         node->visited = 0;
174         node->pre = NULL;
175         node->left = NULL;
176         node->right = NULL;
177         node->parent = NULL;
178 }
179
180 /**********************************************************************
181  **********************************************************************/
182
183 /************ helper functions for heap in dijkstra *******************
184  **********************************************************************/
185 /* returns true if element 1 is smaller than element 2 */
186 static inline uint32_t heap_smaller(binary_heap_t * heap, uint32_t i,
187                                     uint32_t j)
188 {
189         return (heap->nodes[i]->distance < heap->nodes[j]->distance) ? 1 : 0;
190 }
191
192 /* swap two elements */
193 static void heap_exchange(binary_heap_t * heap, uint32_t i, uint32_t j)
194 {
195         uint32_t tmp_heap_id = 0;
196         vertex_t *tmp_node = NULL;
197
198         /* 1. swap the heap_id */
199         tmp_heap_id = heap->nodes[i]->heap_id;
200         heap->nodes[i]->heap_id = heap->nodes[j]->heap_id;
201         heap->nodes[j]->heap_id = tmp_heap_id;
202         /* 2. swap pointers */
203         tmp_node = heap->nodes[i];
204         heap->nodes[i] = heap->nodes[j];
205         heap->nodes[j] = tmp_node;
206 }
207
208 /* changes position of element with parent until children are bigger */
209 static uint32_t heap_up(binary_heap_t * heap, uint32_t i)
210 {
211         uint32_t curr = i, father = 0;
212
213         if (curr > 0) {
214                 father = (curr - 1) >> 1;
215                 while (heap_smaller(heap, curr, father)) {
216                         heap_exchange(heap, curr, father);
217                         /* try to go up when we arent already root */
218                         curr = father;
219                         if (curr > 0)
220                                 father = (curr - 1) >> 1;
221                 }
222         }
223
224         return curr;
225 }
226
227 /* changes position of element with children until parent is smaller */
228 static uint32_t heap_down(binary_heap_t * heap, uint32_t i)
229 {
230         uint32_t curr = i;
231         uint32_t son1 = 0, son2 = 0, smaller_son = 0;
232         uint32_t exchanged = 0;
233
234         do {
235                 son1 = ((curr + 1) << 1) - 1;
236                 son2 = (curr + 1) << 1;
237                 exchanged = 0;
238
239                 /* exchange with smaller son */
240                 if (son1 < heap->size && son2 < heap->size) {
241                         if (heap_smaller(heap, son1, son2))
242                                 smaller_son = son1;
243                         else
244                                 smaller_son = son2;
245                 } else if (son1 < heap->size) {
246                         /* only one son */
247                         smaller_son = son1;
248                 } else {
249                         /* finished */
250                         break;
251                 }
252
253                 /* only exchange when smaller */
254                 if (heap_smaller(heap, smaller_son, curr)) {
255                         heap_exchange(heap, curr, smaller_son);
256                         exchanged = 1;
257                         curr = smaller_son;
258                 }
259         } while (exchanged);
260
261         return curr;
262 }
263
264 /* reheapify element */
265 static inline void heap_heapify(binary_heap_t * heap, uint32_t i)
266 {
267         heap_down(heap, heap_up(heap, i));
268 }
269
270 /* creates heap for graph */
271 static int heap_create(vertex_t * adj_list, uint32_t adj_list_size,
272                        binary_heap_t ** binheap)
273 {
274         binary_heap_t *heap = NULL;
275         uint32_t i = 0;
276
277         /* allocate the memory for the heap object */
278         heap = (binary_heap_t *) malloc(sizeof(binary_heap_t));
279         if (!heap)
280                 return 1;
281
282         /* the heap size is equivalent to the size of the adj_list */
283         heap->size = adj_list_size;
284
285         /* allocate the pointer array, fill with the pointers to the elements of the adj_list and set the initial heap_id */
286         heap->nodes = (vertex_t **) malloc(heap->size * sizeof(vertex_t *));
287         if (!heap->nodes) {
288                 free(heap);
289                 return 1;
290         }
291         for (i = 0; i < heap->size; i++) {
292                 heap->nodes[i] = &adj_list[i];
293                 heap->nodes[i]->heap_id = i;
294         }
295
296         /* sort elements */
297         for (i = heap->size; i > 0; i--)
298                 heap_down(heap, i - 1);
299
300         *binheap = heap;
301         return 0;
302 }
303
304 /* returns current minimum and removes it from heap */
305 static vertex_t *heap_getmin(binary_heap_t * heap)
306 {
307         vertex_t *min = NULL;
308
309         if (heap->size > 0)
310                 min = heap->nodes[0];
311
312         if (min == NULL)
313                 return min;
314
315         if (heap->size > 0) {
316                 if (heap->size > 1) {
317                         heap_exchange(heap, 0, heap->size - 1);
318                         heap->size--;
319                         heap_down(heap, 0);
320                 } else {
321                         heap->size--;
322                 }
323         }
324
325         return min;
326 }
327
328 /* cleanup heap */
329 static void heap_free(binary_heap_t * heap)
330 {
331         if (heap) {
332                 if (heap->nodes) {
333                         free(heap->nodes);
334                         heap->nodes = NULL;
335                 }
336                 free(heap);
337         }
338 }
339
340 /**********************************************************************
341  **********************************************************************/
342
343 /************ helper functions to save src/dest X vl combination ******
344  **********************************************************************/
345 /* compare function of two lids for stdlib qsort */
346 static int cmp_lids(const void *l1, const void *l2)
347 {
348         ib_net16_t lid1 = *((ib_net16_t *) l1), lid2 = *((ib_net16_t *) l2);
349
350         if (lid1 < lid2)
351                 return -1;
352         else if (lid1 > lid2)
353                 return 1;
354         else
355                 return 0;
356 }
357
358 /* use stdlib to sort the lid array */
359 static inline void vltable_sort_lids(vltable_t * vltable)
360 {
361         qsort(vltable->lids, vltable->num_lids, sizeof(ib_net16_t), cmp_lids);
362 }
363
364 /* use stdlib to get index of key in lid array;
365    return -1 if lid isn't found in lids array
366 */
367 static inline int64_t vltable_get_lidindex(ib_net16_t * key, vltable_t * vltable)
368 {
369         ib_net16_t *found_lid = NULL;
370
371         found_lid =
372             (ib_net16_t *) bsearch(key, vltable->lids, vltable->num_lids,
373                                    sizeof(ib_net16_t), cmp_lids);
374         if (found_lid)
375                 return found_lid - vltable->lids;
376         else
377                 return -1;
378 }
379
380 /* get virtual lane from src lid X dest lid combination;
381    return -1 for invalid lids
382 */
383 static int32_t vltable_get_vl(vltable_t * vltable, ib_net16_t slid, ib_net16_t dlid)
384 {
385         int64_t ind1 = vltable_get_lidindex(&slid, vltable);
386         int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
387
388         if (ind1 > -1 && ind2 > -1)
389                 return (int32_t) (vltable->
390                                   vls[ind1 + ind2 * vltable->num_lids]);
391         else
392                 return -1;
393 }
394
395 /* set a virtual lane in the matrix */
396 static inline void vltable_insert(vltable_t * vltable, ib_net16_t slid,
397                                   ib_net16_t dlid, uint8_t vl)
398 {
399         int64_t ind1 = vltable_get_lidindex(&slid, vltable);
400         int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
401
402         if (ind1 > -1 && ind2 > -1)
403                 vltable->vls[ind1 + ind2 * vltable->num_lids] = vl;
404 }
405
406 /* change a number of lanes from lane xy to lane yz */
407 static void vltable_change_vl(vltable_t * vltable, uint8_t from, uint8_t to,
408                               uint64_t count)
409 {
410         uint64_t set = 0, stop = 0;
411         uint64_t ind1 = 0, ind2 = 0;
412
413         for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
414                 for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
415                         if (set == count) {
416                                 stop = 1;
417                                 break;
418                         }
419                         if (ind1 != ind2) {
420                                 if (vltable->
421                                     vls[ind1 + ind2 * vltable->num_lids] ==
422                                     from) {
423                                         vltable->vls[ind1 +
424                                                      ind2 * vltable->num_lids] =
425                                             to;
426                                         set++;
427                                 }
428                         }
429                 }
430                 if (stop)
431                         break;
432         }
433 }
434
435 static void vltable_print(osm_ucast_mgr_t * p_mgr, vltable_t * vltable)
436 {
437         uint64_t ind1 = 0, ind2 = 0;
438
439         for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
440                 for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
441                         if (ind1 != ind2) {
442                                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
443                                         "   route from src_lid=%" PRIu16
444                                         " to dest_lid=%" PRIu16 " on vl=%" PRIu8
445                                         "\n", cl_ntoh16(vltable->lids[ind1]),
446                                         cl_ntoh16(vltable->lids[ind2]),
447                                         vltable->vls[ind1 +
448                                                      ind2 * vltable->num_lids]);
449                         }
450                 }
451         }
452 }
453
454 static void vltable_dealloc(vltable_t ** vltable)
455 {
456         if (*vltable) {
457                 if ((*vltable)->lids)
458                         free((*vltable)->lids);
459                 if ((*vltable)->vls)
460                         free((*vltable)->vls);
461                 free(*vltable);
462                 *vltable = NULL;
463         }
464 }
465
466 static int vltable_alloc(vltable_t ** vltable, uint64_t size)
467 {
468         /* allocate VL table and indexing array */
469         *vltable = (vltable_t *) malloc(sizeof(vltable_t));
470         if (!(*vltable))
471                 goto ERROR;
472         (*vltable)->num_lids = size;
473         (*vltable)->lids = (ib_net16_t *) malloc(size * sizeof(ib_net16_t));
474         if (!((*vltable)->lids))
475                 goto ERROR;
476         (*vltable)->vls = (uint8_t *) malloc(size * size * sizeof(uint8_t));
477         if (!((*vltable)->vls))
478                 goto ERROR;
479         memset((*vltable)->vls, OSM_DEFAULT_SL, size * size);
480
481         return 0;
482
483 ERROR:
484         vltable_dealloc(vltable);
485
486         return 1;
487 }
488
489 /**********************************************************************
490  **********************************************************************/
491
492 /************ helper functions to save/manage the channel dep. graph **
493  **********************************************************************/
494 /* update the srcdest array;
495    realloc array (double the size) if size is not large enough
496 */
497 static void set_next_srcdest_pair(cdg_link_t * link, uint32_t srcdest)
498 {
499         uint32_t new_size = 0, start_size = 2;
500         uint32_t *tmp = NULL, *tmp2 = NULL;
501
502         if (link->num_pairs == 0) {
503                 link->srcdest_pairs =
504                     (uint32_t *) malloc(start_size * sizeof(uint32_t));
505                 link->srcdest_pairs[link->num_pairs] = srcdest;
506                 link->max_len = start_size;
507                 link->removed = 0;
508         } else if (link->num_pairs == link->max_len) {
509                 new_size = link->max_len << 1;
510                 tmp = (uint32_t *) malloc(new_size * sizeof(uint32_t));
511                 tmp =
512                     memcpy(tmp, link->srcdest_pairs,
513                            link->max_len * sizeof(uint32_t));
514                 tmp2 = link->srcdest_pairs;
515                 link->srcdest_pairs = tmp;
516                 link->srcdest_pairs[link->num_pairs] = srcdest;
517                 free(tmp2);
518                 link->max_len = new_size;
519         } else {
520                 link->srcdest_pairs[link->num_pairs] = srcdest;
521         }
522         link->num_pairs++;
523 }
524
525 static inline uint32_t get_next_srcdest_pair(cdg_link_t * link, uint32_t index)
526 {
527         return link->srcdest_pairs[index];
528 }
529
530 /* traverse binary tree to find a node */
531 static cdg_node_t *cdg_search(cdg_node_t * root, uint64_t channelID)
532 {
533         while (root) {
534                 if (channelID < root->channelID)
535                         root = root->left;
536                 else if (channelID > root->channelID)
537                         root = root->right;
538                 else if (channelID == root->channelID)
539                         return root;
540         }
541         return NULL;
542 }
543
544 /* insert new node into the binary tree */
545 static void cdg_insert(cdg_node_t ** root, cdg_node_t * new_node)
546 {
547         cdg_node_t *current = *root;
548
549         if (!current) {
550                 current = new_node;
551                 *root = current;
552                 return;
553         }
554
555         while (current) {
556                 if (new_node->channelID < current->channelID) {
557                         if (current->left) {
558                                 current = current->left;
559                         } else {
560                                 current->left = new_node;
561                                 new_node->parent = current;
562                                 break;
563                         }
564                 } else if (new_node->channelID > current->channelID) {
565                         if (current->right) {
566                                 current = current->right;
567                         } else {
568                                 current->right = new_node;
569                                 new_node->parent = current;
570                                 break;
571                         }
572                 } else if (new_node->channelID == current->channelID) {
573                         /* not really possible, maybe programming error */
574                         break;
575                 }
576         }
577 }
578
579 static void cdg_node_dealloc(cdg_node_t * node)
580 {
581         cdg_link_t *link = node->linklist, *tmp = NULL;
582
583         /* dealloc linklist */
584         while (link) {
585                 tmp = link;
586                 link = link->next;
587
588                 if (tmp->num_pairs)
589                         free(tmp->srcdest_pairs);
590                 free(tmp);
591         }
592         /* dealloc node */
593         free(node);
594 }
595
596 static void cdg_dealloc(cdg_node_t ** root)
597 {
598         cdg_node_t *current = *root;
599
600         while (current) {
601                 if (current->left) {
602                         current = current->left;
603                 } else if (current->right) {
604                         current = current->right;
605                 } else {
606                         if (current->parent == NULL) {
607                                 cdg_node_dealloc(current);
608                                 *root = NULL;
609                                 break;
610                         }
611                         if (current->parent->left == current) {
612                                 current = current->parent;
613                                 cdg_node_dealloc(current->left);
614                                 current->left = NULL;
615                         } else if (current->parent->right == current) {
616                                 current = current->parent;
617                                 cdg_node_dealloc(current->right);
618                                 current->right = NULL;
619                         }
620                 }
621         }
622 }
623
624 /* search for a edge in the cdg which should be removed to break a cycle */
625 static cdg_link_t *get_weakest_link_in_cycle(cdg_node_t * cycle)
626 {
627         cdg_node_t *current = cycle, *node_with_weakest_link = NULL;
628         cdg_link_t *link = NULL, *weakest_link = NULL;
629
630         link = current->linklist;
631         while (link) {
632                 if (link->node->status == GRAY) {
633                         weakest_link = link;
634                         node_with_weakest_link = current;
635                         current = link->node;
636                         break;
637                 }
638                 link = link->next;
639         }
640
641         while (1) {
642                 current->status = UNKNOWN;
643                 link = current->linklist;
644                 while (link) {
645                         if (link->node->status == GRAY) {
646                                 if ((link->num_pairs - link->removed) <
647                                     (weakest_link->num_pairs -
648                                      weakest_link->removed)) {
649                                         weakest_link = link;
650                                         node_with_weakest_link = current;
651                                 }
652                                 current = link->node;
653                                 break;
654                         }
655                         link = link->next;
656                 }
657                 /* if complete cycle is traversed */
658                 if (current == cycle) {
659                         current->status = UNKNOWN;
660                         break;
661                 }
662         }
663
664         if (node_with_weakest_link->linklist == weakest_link) {
665                 node_with_weakest_link->linklist = weakest_link->next;
666         } else {
667                 link = node_with_weakest_link->linklist;
668                 while (link) {
669                         if (link->next == weakest_link) {
670                                 link->next = weakest_link->next;
671                                 break;
672                         }
673                         link = link->next;
674                 }
675         }
676
677         return weakest_link;
678 }
679
680 /* search for nodes in the cdg not yet reached in the cycle search process;
681    (some nodes are unreachable, e.g. a node is a source or the cdg has not connected parts)
682 */
683 static cdg_node_t *get_next_cdg_node(cdg_node_t * root)
684 {
685         cdg_node_t *current = root, *res = NULL;
686
687         while (current) {
688                 current->visited = 1;
689                 if (current->status == UNKNOWN) {
690                         res = current;
691                         break;
692                 }
693                 if (current->left && !current->left->visited) {
694                         current = current->left;
695                 } else if (current->right && !current->right->visited) {
696                         current = current->right;
697                 } else {
698                         if (current->left)
699                                 current->left->visited = 0;
700                         if (current->right)
701                                 current->right->visited = 0;
702                         if (current->parent == NULL)
703                                 break;
704                         else
705                                 current = current->parent;
706                 }
707         }
708
709         /* Clean up */
710         while (current) {
711                 current->visited = 0;
712                 if (current->left)
713                         current->left->visited = 0;
714                 if (current->right)
715                         current->right->visited = 0;
716                 current = current->parent;
717         }
718
719         return res;
720 }
721
722 /* make a DFS on the cdg to check for a cycle */
723 static cdg_node_t *search_cycle_in_channel_dep_graph(cdg_node_t * cdg,
724                                                      cdg_node_t * start_node)
725 {
726         cdg_node_t *cycle = NULL;
727         cdg_node_t *current = start_node, *next_node = NULL, *tmp = NULL;
728         cdg_link_t *link = NULL;
729
730         while (current) {
731                 current->status = GRAY;
732                 link = current->linklist;
733                 next_node = NULL;
734                 while (link) {
735                         if (link->node->status == UNKNOWN) {
736                                 next_node = link->node;
737                                 break;
738                         }
739                         if (link->node->status == GRAY) {
740                                 cycle = link->node;
741                                 goto Exit;
742                         }
743                         link = link->next;
744                 }
745                 if (next_node) {
746                         next_node->pre = current;
747                         current = next_node;
748                 } else {
749                         /* found a sink in the graph, go to last node */
750                         current->status = BLACK;
751
752                         /* srcdest_pairs of this node aren't relevant, free the allocated memory */
753                         link = current->linklist;
754                         while (link) {
755                                 if (link->num_pairs)
756                                         free(link->srcdest_pairs);
757                                 link->srcdest_pairs = NULL;
758                                 link->num_pairs = 0;
759                                 link->removed = 0;
760                                 link = link->next;
761                         }
762
763                         if (current->pre) {
764                                 tmp = current;
765                                 current = current->pre;
766                                 tmp->pre = NULL;
767                         } else {
768                                 /* search for other subgraphs in cdg */
769                                 current = get_next_cdg_node(cdg);
770                                 if (!current)
771                                         break;  /* all relevant nodes traversed, no more cycles found */
772                         }
773                 }
774         }
775
776 Exit:
777         return cycle;
778 }
779
780 /* calculate the path from source to destination port;
781    new channels are added directly to the cdg
782 */
783 static int update_channel_dep_graph(cdg_node_t ** cdg_root,
784                                     osm_port_t * src_port, uint16_t slid,
785                                     osm_port_t * dest_port, uint16_t dlid)
786 {
787         osm_node_t *local_node = NULL, *remote_node = NULL;
788         uint16_t local_lid = 0, remote_lid = 0;
789         uint32_t srcdest = 0;
790         uint8_t local_port = 0, remote_port = 0;
791         uint64_t channelID = 0;
792
793         cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
794         cdg_link_t *linklist = NULL;
795
796         /* set the identifier for the src/dest pair to save this on each edge of the cdg */
797         srcdest = (((uint32_t) slid) << 16) + ((uint32_t) dlid);
798
799         channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
800         if (!channel_head)
801                 goto ERROR;
802         set_default_cdg_node(channel_head);
803         last_channel = channel_head;
804
805         /* if src is a Hca, then the channel from Hca to switch would be a source in the graph
806            sources can't be part of a cycle -> skip this channel
807          */
808         remote_node =
809             osm_node_get_remote_node(src_port->p_node,
810                                      src_port->p_physp->port_num, &remote_port);
811
812         while (remote_node && remote_node->sw) {
813                 local_node = remote_node;
814                 local_port = local_node->sw->new_lft[dlid];
815                 /* sanity check: local_port must be set or routing is broken */
816                 if (local_port == OSM_NO_PATH)
817                         goto ERROR;
818                 local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
819                 /* each port belonging to a switch has lmc==0 -> get_base_lid is fine
820                    (local/remote port in this function are always part of a switch)
821                  */
822
823                 remote_node =
824                     osm_node_get_remote_node(local_node, local_port,
825                                              &remote_port);
826                 /* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
827                 if (!remote_node || !remote_node->sw)
828                         break;
829                 remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
830
831                 channelID =
832                     (((uint64_t) local_lid) << 48) +
833                     (((uint64_t) local_port) << 32) +
834                     (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
835                 channel = cdg_search(*cdg_root, channelID);
836                 if (channel) {
837                         /* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
838                         linklist = last_channel->linklist;
839                         while (linklist && linklist->node != channel
840                                && linklist->next)
841                                 linklist = linklist->next;
842                         /* if there is no connection, add one */
843                         if (linklist) {
844                                 if (linklist->node == channel) {
845                                         set_next_srcdest_pair(linklist,
846                                                               srcdest);
847                                 } else {
848                                         linklist->next =
849                                             (cdg_link_t *)
850                                             malloc(sizeof(cdg_link_t));
851                                         if (!linklist->next)
852                                                 goto ERROR;
853                                         linklist = linklist->next;
854                                         linklist->node = channel;
855                                         linklist->num_pairs = 0;
856                                         linklist->srcdest_pairs = NULL;
857                                         set_next_srcdest_pair(linklist,
858                                                               srcdest);
859                                         linklist->next = NULL;
860                                 }
861                         } else {
862                                 /* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
863                                 last_channel->linklist =
864                                     (cdg_link_t *) malloc(sizeof(cdg_link_t));
865                                 if (!last_channel->linklist)
866                                         goto ERROR;
867                                 last_channel->linklist->node = channel;
868                                 last_channel->linklist->num_pairs = 0;
869                                 last_channel->linklist->srcdest_pairs = NULL;
870                                 set_next_srcdest_pair(last_channel->linklist,
871                                                       srcdest);
872                                 last_channel->linklist->next = NULL;
873                         }
874                 } else {
875                         /* create new channel */
876                         channel = (cdg_node_t *) malloc(sizeof(cdg_node_t));
877                         if (!channel)
878                                 goto ERROR;
879                         set_default_cdg_node(channel);
880                         channel->channelID = channelID;
881                         cdg_insert(cdg_root, channel);
882
883                         /* go to end of link list of last channel */
884                         linklist = last_channel->linklist;
885                         while (linklist && linklist->next)
886                                 linklist = linklist->next;
887                         if (linklist) {
888                                 /* update last link of an existing channel */
889                                 linklist->next =
890                                     (cdg_link_t *) malloc(sizeof(cdg_link_t));
891                                 if (!linklist->next)
892                                         goto ERROR;
893                                 linklist = linklist->next;
894                                 linklist->node = channel;
895                                 linklist->num_pairs = 0;
896                                 linklist->srcdest_pairs = NULL;
897                                 set_next_srcdest_pair(linklist, srcdest);
898                                 linklist->next = NULL;
899                         } else {
900                                 /* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
901                                 last_channel->linklist =
902                                     (cdg_link_t *) malloc(sizeof(cdg_link_t));
903                                 if (!last_channel->linklist)
904                                         goto ERROR;
905                                 last_channel->linklist->node = channel;
906                                 last_channel->linklist->num_pairs = 0;
907                                 last_channel->linklist->srcdest_pairs = NULL;
908                                 set_next_srcdest_pair(last_channel->linklist,
909                                                       srcdest);
910                                 last_channel->linklist->next = NULL;
911                         }
912                 }
913                 last_channel = channel;
914         }
915
916         if (channel_head->linklist) {
917                 if (channel_head->linklist->srcdest_pairs)
918                         free(channel_head->linklist->srcdest_pairs);
919                 free(channel_head->linklist);
920         }
921         free(channel_head);
922
923         return 0;
924
925 ERROR:
926         /* cleanup data and exit */
927         if (channel_head) {
928                 if (channel_head->linklist)
929                         free(channel_head->linklist);
930                 free(channel_head);
931         }
932
933         return 1;
934 }
935
936 /* calculate the path from source to destination port;
937    the links in the cdg representing this path are decremented to simulate the removal
938 */
939 static int remove_path_from_cdg(cdg_node_t ** cdg_root, osm_port_t * src_port,
940                                 uint16_t slid, osm_port_t * dest_port,
941                                 uint16_t dlid)
942 {
943         osm_node_t *local_node = NULL, *remote_node = NULL;
944         uint16_t local_lid = 0, remote_lid = 0;
945         uint8_t local_port = 0, remote_port = 0;
946         uint64_t channelID = 0;
947
948         cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
949         cdg_link_t *linklist = NULL;
950
951         channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
952         if (!channel_head)
953                 goto ERROR;
954         set_default_cdg_node(channel_head);
955         last_channel = channel_head;
956
957         /* if src is a Hca, then the channel from Hca to switch would be a source in the graph
958            sources can't be part of a cycle -> skip this channel
959          */
960         remote_node =
961             osm_node_get_remote_node(src_port->p_node,
962                                      src_port->p_physp->port_num, &remote_port);
963
964         while (remote_node && remote_node->sw) {
965                 local_node = remote_node;
966                 local_port = local_node->sw->new_lft[dlid];
967                 /* sanity check: local_port must be set or routing is broken */
968                 if (local_port == OSM_NO_PATH)
969                         goto ERROR;
970                 local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
971
972                 remote_node =
973                     osm_node_get_remote_node(local_node, local_port,
974                                              &remote_port);
975                 /* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
976                 if (!remote_node || !remote_node->sw)
977                         break;
978                 remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
979
980                 channelID =
981                     (((uint64_t) local_lid) << 48) +
982                     (((uint64_t) local_port) << 32) +
983                     (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
984                 channel = cdg_search(*cdg_root, channelID);
985                 if (channel) {
986                         /* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
987                         linklist = last_channel->linklist;
988                         while (linklist && linklist->node != channel
989                                && linklist->next)
990                                 linklist = linklist->next;
991                         /* remove the srcdest from the link */
992                         if (linklist) {
993                                 if (linklist->node == channel) {
994                                         linklist->removed++;
995                                 } else {
996                                         /* may happen if the link is missing (thru cycle detect algorithm) */
997                                 }
998                         } else {
999                                 /* may happen if the link is missing (thru cycle detect algorithm or last_channel==channel_head (dummy channel)) */
1000                         }
1001                 } else {
1002                         /* must be an error, channels for the path are added before, so a missing channel would be a corrupt data structure */
1003                         goto ERROR;
1004                 }
1005                 last_channel = channel;
1006         }
1007
1008         if (channel_head->linklist)
1009                 free(channel_head->linklist);
1010         free(channel_head);
1011
1012         return 0;
1013
1014 ERROR:
1015         /* cleanup data and exit */
1016         if (channel_head) {
1017                 if (channel_head->linklist)
1018                         free(channel_head->linklist);
1019                 free(channel_head);
1020         }
1021
1022         return 1;
1023 }
1024
1025 /**********************************************************************
1026  **********************************************************************/
1027
1028 /************ helper functions to generate an ordered list of ports ***
1029  ************ (functions copied from osm_ucast_mgr.c and modified) ****
1030  **********************************************************************/
1031 static void add_sw_endports_to_order_list(osm_switch_t * sw,
1032                                           osm_ucast_mgr_t * m,
1033                                           cl_qmap_t * guid_tbl,
1034                                           boolean_t add_guids)
1035 {
1036         osm_port_t *port;
1037         ib_net64_t port_guid;
1038         uint64_t sw_guid;
1039         osm_physp_t *p;
1040         int i;
1041         boolean_t found;
1042
1043         for (i = 1; i < sw->num_ports; i++) {
1044                 p = osm_node_get_physp_ptr(sw->p_node, i);
1045                 if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) {
1046                         port_guid = p->p_remote_physp->port_guid;
1047                         /* check if link is healthy, otherwise ignore CA */
1048                         if (!osm_link_is_healthy(p)) {
1049                                 sw_guid =
1050                                     cl_ntoh64(osm_node_get_node_guid
1051                                               (sw->p_node));
1052                                 OSM_LOG(m->p_log, OSM_LOG_INFO,
1053                                         "WRN AD40: ignoring CA due to unhealthy"
1054                                         " link from switch 0x%016" PRIx64
1055                                         " port %" PRIu8 " to CA 0x%016" PRIx64
1056                                         "\n", sw_guid, i, cl_ntoh64(port_guid));
1057                         }
1058                         port = osm_get_port_by_guid(m->p_subn, port_guid);
1059                         if (!port)
1060                                 continue;
1061                         if (!cl_is_qmap_empty(guid_tbl)) {
1062                                 found = (cl_qmap_get(guid_tbl, port_guid)
1063                                          != cl_qmap_end(guid_tbl));
1064                                 if ((add_guids && !found)
1065                                     || (!add_guids && found))
1066                                         continue;
1067                         }
1068                         if (!cl_is_item_in_qlist(&m->port_order_list,
1069                                                  &port->list_item))
1070                                 cl_qlist_insert_tail(&m->port_order_list,
1071                                                      &port->list_item);
1072                         else
1073                                 OSM_LOG(m->p_log, OSM_LOG_INFO,
1074                                         "WRN AD37: guid 0x%016" PRIx64
1075                                         " already in list\n", port_guid);
1076                 }
1077         }
1078 }
1079
1080 static void add_guid_to_order_list(uint64_t guid, osm_ucast_mgr_t * m)
1081 {
1082         osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid));
1083
1084         if (!port) {
1085                  OSM_LOG(m->p_log, OSM_LOG_DEBUG,
1086                          "port guid not found: 0x%016" PRIx64 "\n", guid);
1087         }
1088
1089         if (!cl_is_item_in_qlist(&m->port_order_list, &port->list_item))
1090                 cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
1091         else
1092                 OSM_LOG(m->p_log, OSM_LOG_INFO,
1093                         "WRN AD38: guid 0x%016" PRIx64 " already in list\n",
1094                         guid);
1095 }
1096
1097 /* compare function of #Hca attached to a switch for stdlib qsort */
1098 static int cmp_num_hca(const void * l1, const void * l2)
1099 {
1100         vertex_t *sw1 = *((vertex_t **) l1);
1101         vertex_t *sw2 = *((vertex_t **) l2);
1102         uint32_t num_hca1 = 0, num_hca2 = 0;
1103
1104         if (sw1)
1105                 num_hca1 = sw1->num_hca;
1106         if (sw2)
1107                 num_hca2 = sw2->num_hca;
1108
1109         if (num_hca1 > num_hca2)
1110                 return -1;
1111         else if (num_hca1 < num_hca2)
1112                 return 1;
1113         else
1114                 return 0;
1115 }
1116
1117 /* use stdlib to sort the switch array depending on num_hca */
1118 static inline void sw_list_sort_by_num_hca(vertex_t ** sw_list,
1119                                            uint32_t sw_list_size)
1120 {
1121         qsort(sw_list, sw_list_size, sizeof(vertex_t *), cmp_num_hca);
1122 }
1123
1124 /**********************************************************************
1125  **********************************************************************/
1126
1127 /************ helper functions to manage a map of CN and I/O guids ****
1128  **********************************************************************/
1129 static int add_guid_to_map(void * cxt, uint64_t guid, char * p)
1130 {
1131         cl_qmap_t *map = cxt;
1132         name_map_item_t *item;
1133         name_map_item_t *inserted_item;
1134
1135         item = malloc(sizeof(*item));
1136         if (!item)
1137                 return -1;
1138
1139         item->guid = cl_hton64(guid);   /* internal: network byte order */
1140         item->name = NULL;              /* name isn't needed */
1141         inserted_item = (name_map_item_t *) cl_qmap_insert(map, item->guid, &item->item);
1142         if (inserted_item != item)
1143                 free(item);
1144
1145         return 0;
1146 }
1147
1148 static void destroy_guid_map(cl_qmap_t * guid_tbl)
1149 {
1150         name_map_item_t *p_guid = NULL, *p_next_guid = NULL;
1151
1152         p_next_guid = (name_map_item_t *) cl_qmap_head(guid_tbl);
1153         while (p_next_guid != (name_map_item_t *) cl_qmap_end(guid_tbl)) {
1154                 p_guid = p_next_guid;
1155                 p_next_guid = (name_map_item_t *) cl_qmap_next(&p_guid->item);
1156                 free(p_guid);
1157         }
1158         cl_qmap_remove_all(guid_tbl);
1159 }
1160
1161 /**********************************************************************
1162  **********************************************************************/
1163
1164 static void dfsssp_print_graph(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1165                                uint32_t size)
1166 {
1167         uint32_t i = 0, c = 0;
1168         link_t *link = NULL;
1169
1170         /* index 0 is for the source in dijkstra -> ignore */
1171         for (i = 1; i < size; i++) {
1172                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, "adj_list[%" PRIu32 "]:\n",
1173                         i);
1174                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1175                         "   guid = 0x%" PRIx64 " lid = %" PRIu16 " (%s)\n",
1176                         adj_list[i].guid, adj_list[i].lid,
1177                         adj_list[i].sw->p_node->print_desc);
1178                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1179                         "   num_hca = %" PRIu32 "\n", adj_list[i].num_hca);
1180
1181                 c = 1;
1182                 for (link = adj_list[i].links; link != NULL;
1183                      link = link->next, c++) {
1184                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1185                                 "   link[%" PRIu32 "]:\n", c);
1186                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1187                                 "      to guid = 0x%" PRIx64 " (%s) port %"
1188                                 PRIu8 "\n", link->guid,
1189                                 adj_list[link->to].sw->p_node->print_desc,
1190                                 link->to_port);
1191                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1192                                 "      weight on this link = %" PRIu64 "\n",
1193                                 link->weight);
1194                 }
1195         }
1196 }
1197
1198 /* predefine, to use this in next function */
1199 static void dfsssp_context_destroy(void *context);
1200 static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1201                     uint32_t adj_list_size, osm_port_t * port, uint16_t lid);
1202
1203 /* traverse subnet to gather information about the connected switches */
1204 static int dfsssp_build_graph(void *context)
1205 {
1206         dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
1207         osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) (dfsssp_ctx->p_mgr);
1208
1209         cl_qmap_t *port_tbl = &p_mgr->p_subn->port_guid_tbl;    /* 1 management port per switch + 1 or 2 ports for each Hca */
1210         osm_port_t *p_port = NULL;
1211         cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1212         cl_map_item_t *item = NULL;
1213         osm_switch_t *sw = NULL;
1214         osm_node_t *remote_node = NULL;
1215         uint8_t port = 0, remote_port = 0;
1216         uint32_t i = 0, j = 0, err = 0, undiscov = 0, max_num_undiscov = 0;
1217         uint64_t total_num_hca = 0;
1218         vertex_t *adj_list = NULL;
1219         osm_physp_t *p_physp = NULL;
1220         link_t *link = NULL, *head = NULL;
1221         uint32_t num_sw = 0, adj_list_size = 0;
1222         uint8_t lmc = 0;
1223         uint16_t sm_lid = 0;
1224
1225         OSM_LOG_ENTER(p_mgr->p_log);
1226         OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1227                 "Building graph for df-/sssp routing\n");
1228
1229         /* if this pointer isn't NULL, this is a reroute step;
1230            old context will be destroyed (adj_list and srcdest2vl_table)
1231          */
1232         if (dfsssp_ctx->adj_list)
1233                 dfsssp_context_destroy(context);
1234
1235         num_sw = cl_qmap_count(sw_tbl);
1236         adj_list_size = num_sw + 1;
1237         /* allocate an adjazenz list (array), 0. element is reserved for the source (Hca) in the routing algo, others are switches */
1238         adj_list = (vertex_t *) malloc(adj_list_size * sizeof(vertex_t));
1239         if (!adj_list) {
1240                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1241                         "ERR AD02: cannot allocate memory for adj_list\n");
1242                 goto ERROR;
1243         }
1244         for (i = 0; i < adj_list_size; i++)
1245                 set_default_vertex(&adj_list[i]);
1246
1247         dfsssp_ctx->adj_list = adj_list;
1248         dfsssp_ctx->adj_list_size = adj_list_size;
1249
1250         /* count the total number of Hca / LIDs (for lmc>0) in the fabric;
1251            even include base/enhanced switch port 0; base SP0 will have lmc=0
1252          */
1253         for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
1254              item = cl_qmap_next(item)) {
1255                 p_port = (osm_port_t *) item;
1256                 if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA ||
1257                     osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_SWITCH) {
1258                         lmc = osm_port_get_lmc(p_port);
1259                         total_num_hca += (1 << lmc);
1260                 }
1261         }
1262
1263         i = 1;                  /* fill adj_list -> start with index 1 */
1264         for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1265              item = cl_qmap_next(item), i++) {
1266                 sw = (osm_switch_t *) item;
1267                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1268                         "Processing switch with GUID 0x%" PRIx64 "\n",
1269                         cl_ntoh64(osm_node_get_node_guid(sw->p_node)));
1270
1271                 adj_list[i].guid =
1272                     cl_ntoh64(osm_node_get_node_guid(sw->p_node));
1273                 adj_list[i].lid =
1274                     cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
1275                 adj_list[i].sw = sw;
1276
1277                 link = (link_t *) malloc(sizeof(link_t));
1278                 if (!link) {
1279                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1280                                 "ERR AD03: cannot allocate memory for a link\n");
1281                         goto ERROR;
1282                 }
1283                 head = link;
1284                 head->next = NULL;
1285
1286                 /* add SP0 to number of CA connected to a switch */
1287                 lmc = osm_node_get_lmc(sw->p_node, 0);
1288                 adj_list[i].num_hca += (1 << lmc);
1289
1290                 /* iterate over all ports in the switch, start with port 1 (port 0 is a management port) */
1291                 for (port = 1; port < sw->num_ports; port++) {
1292                         /* get the node behind the port */
1293                         remote_node =
1294                             osm_node_get_remote_node(sw->p_node, port,
1295                                                      &remote_port);
1296                         /* if there is no remote node on this port or it's the same switch -> try next port */
1297                         if (!remote_node || remote_node->sw == sw)
1298                                 continue;
1299                         /* make sure the link is healthy */
1300                         p_physp = osm_node_get_physp_ptr(sw->p_node, port);
1301                         if (!p_physp || !osm_link_is_healthy(p_physp))
1302                                 continue;
1303                         /* if there is a Hca connected -> count and cycle */
1304                         if (!remote_node->sw) {
1305                                 lmc = osm_node_get_lmc(remote_node, (uint32_t)remote_port);
1306                                 adj_list[i].num_hca += (1 << lmc);
1307                                 continue;
1308                         }
1309                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1310                                 "Node 0x%" PRIx64 ", remote node 0x%" PRIx64
1311                                 ", port %" PRIu8 ", remote port %" PRIu8 "\n",
1312                                 cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
1313                                 cl_ntoh64(osm_node_get_node_guid(remote_node)),
1314                                 port, remote_port);
1315
1316                         link->next = (link_t *) malloc(sizeof(link_t));
1317                         if (!link->next) {
1318                                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1319                                         "ERR AD08: cannot allocate memory for a link\n");
1320                                 while (head) {
1321                                         link = head;
1322                                         head = head->next;
1323                                         free(link);
1324                                 }
1325                                 goto ERROR;
1326                         }
1327                         link = link->next;
1328                         set_default_link(link);
1329                         link->guid =
1330                             cl_ntoh64(osm_node_get_node_guid(remote_node));
1331                         link->from = i;
1332                         link->from_port = port;
1333                         link->to_port = remote_port;
1334                         link->weight = total_num_hca * total_num_hca;   /* initialize with P^2 to force shortest paths */
1335                 }
1336
1337                 adj_list[i].links = head->next;
1338                 free(head);
1339         }
1340         /* connect the links with it's second adjacent node in the list */
1341         for (i = 1; i < adj_list_size; i++) {
1342                 link = adj_list[i].links;
1343                 while (link) {
1344                         for (j = 1; j < adj_list_size; j++) {
1345                                 if (link->guid == adj_list[j].guid) {
1346                                         link->to = j;
1347                                         break;
1348                                 }
1349                         }
1350                         link = link->next;
1351                 }
1352         }
1353         /* do one dry run to determine connectivity issues */
1354         sm_lid = p_mgr->p_subn->master_sm_base_lid;
1355         p_port = osm_get_port_by_lid(p_mgr->p_subn, sm_lid);
1356         err = dijkstra(p_mgr, adj_list, adj_list_size, p_port, sm_lid);
1357         if (err) {
1358                 goto ERROR;
1359         } else {
1360                 /* if sm is running on a switch, then dijkstra doesn't
1361                    initialize the used_link for this switch
1362                  */
1363                 if (osm_node_get_type(p_port->p_node) != IB_NODE_TYPE_CA)
1364                         max_num_undiscov = 1;
1365                 for (i = 1; i < adj_list_size; i++)
1366                         undiscov += (adj_list[i].used_link) ? 0 : 1;
1367                 if (max_num_undiscov < undiscov) {
1368                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1369                                 "ERR AD0C: unsupported network state (detached"
1370                                 " and inaccessible switches found; gracefully"
1371                                 " shutdown this routing engine)\n");
1372                         goto ERROR;
1373                 }
1374         }
1375         /* print the discovered graph */
1376         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
1377                 dfsssp_print_graph(p_mgr, adj_list, adj_list_size);
1378
1379         OSM_LOG_EXIT(p_mgr->p_log);
1380         return 0;
1381
1382 ERROR:
1383         dfsssp_context_destroy(context);
1384         return -1;
1385 }
1386
1387 static void print_routes(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1388                          uint32_t adj_list_size, osm_port_t * port)
1389 {
1390         uint32_t i = 0, j = 0;
1391
1392         for (i = 1; i < adj_list_size; i++) {
1393                 if (adj_list[i].state == DISCOVERED) {
1394                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1395                                 "Route from 0x%" PRIx64 " (%s) to 0x%" PRIx64
1396                                 " (%s):\n", adj_list[i].guid,
1397                                 adj_list[i].sw->p_node->print_desc,
1398                                 cl_ntoh64(osm_node_get_node_guid(port->p_node)),
1399                                 port->p_node->print_desc);
1400                         j = i;
1401                         while (adj_list[j].used_link) {
1402                                 if (j > 0) {
1403                                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1404                                                 "   0x%" PRIx64
1405                                                 " (%s) routes thru port %" PRIu8
1406                                                 "\n", adj_list[j].guid,
1407                                                 adj_list[j].sw->p_node->
1408                                                 print_desc,
1409                                                 adj_list[j].used_link->to_port);
1410                                 } else {
1411                                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1412                                                 "   0x%" PRIx64
1413                                                 " (%s) routes thru port %" PRIu8
1414                                                 "\n", adj_list[j].guid,
1415                                                 port->p_node->print_desc,
1416                                                 adj_list[j].used_link->to_port);
1417                                 }
1418                                 j = adj_list[j].used_link->from;
1419                         }
1420                 }
1421         }
1422 }
1423
1424 /* dijkstra step from one source to all switches in the df-/sssp graph */
1425 static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1426                     uint32_t adj_list_size, osm_port_t * port, uint16_t lid)
1427 {
1428         uint32_t i = 0, j = 0, index = 0;
1429         osm_node_t *remote_node = NULL;
1430         uint8_t remote_port = 0;
1431         vertex_t *current = NULL;
1432         link_t *link = NULL;
1433         uint64_t guid = 0;
1434         binary_heap_t *heap = NULL;
1435         int err = 0;
1436
1437         OSM_LOG_ENTER(p_mgr->p_log);
1438
1439         /* reset all switches for new round with a new source for dijkstra */
1440         for (i = 1; i < adj_list_size; i++) {
1441                 adj_list[i].hops = 0;
1442                 adj_list[i].used_link = NULL;
1443                 adj_list[i].distance = INF;
1444                 adj_list[i].state = UNDISCOVERED;
1445         }
1446
1447         /* if behind port is a Hca -> set adj_list[0] */
1448         if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
1449                 /* save old link to prevent many mallocs after set_default_... */
1450                 link = adj_list[0].links;
1451                 /* initialize adj_list[0] (the source for the routing, a Hca) */
1452                 set_default_vertex(&adj_list[0]);
1453                 adj_list[0].guid =
1454                     cl_ntoh64(osm_node_get_node_guid(port->p_node));
1455                 adj_list[0].lid = lid;
1456                 index = 0;
1457                 /* write saved link back to new adj_list[0] */
1458                 adj_list[0].links = link;
1459
1460                 /* initialize link to neighbor for adj_list[0];
1461                    make sure the link is healthy
1462                  */
1463                 if (port->p_physp && osm_link_is_healthy(port->p_physp)) {
1464                         remote_node =
1465                             osm_node_get_remote_node(port->p_node,
1466                                                      port->p_physp->port_num,
1467                                                      &remote_port);
1468                         /* if there is no remote node on this port or it's the same Hca -> ignore */
1469                         if (remote_node
1470                             && (osm_node_get_type(remote_node) ==
1471                                 IB_NODE_TYPE_SWITCH)) {
1472                                 if (!(adj_list[0].links)) {
1473                                         adj_list[0].links =
1474                                             (link_t *) malloc(sizeof(link_t));
1475                                         if (!(adj_list[0].links)) {
1476                                                 OSM_LOG(p_mgr->p_log,
1477                                                         OSM_LOG_ERROR,
1478                                                         "ERR AD07: cannot allocate memory for a link\n");
1479                                                 return 1;
1480                                         }
1481                                 }
1482                                 set_default_link(adj_list[0].links);
1483                                 adj_list[0].links->guid =
1484                                     cl_ntoh64(osm_node_get_node_guid
1485                                               (remote_node));
1486                                 adj_list[0].links->from_port =
1487                                     port->p_physp->port_num;
1488                                 adj_list[0].links->to_port = remote_port;
1489                                 adj_list[0].links->weight = 1;
1490                                 for (j = 1; j < adj_list_size; j++) {
1491                                         if (adj_list[0].links->guid ==
1492                                             adj_list[j].guid) {
1493                                                 adj_list[0].links->to = j;
1494                                                 break;
1495                                         }
1496                                 }
1497                         }
1498                 } else {
1499                         /* if link is unhealthy then there's a severe issue */
1500                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1501                                 "ERR AD0B: unsupported network state (CA with"
1502                                 " unhealthy link state discovered; should have"
1503                                 " been filtered out before already; gracefully"
1504                                 " shutdown this routing engine)\n");
1505                         return 1;
1506                 }
1507                 /* if behind port is a switch -> search switch in adj_list */
1508         } else {
1509                 /* reset adj_list[0], if links=NULL reset was done before, then skip */
1510                 if (adj_list[0].links) {
1511                         free(adj_list[0].links);
1512                         set_default_vertex(&adj_list[0]);
1513                 }
1514                 /* search for the switch which is the source in this round */
1515                 guid = cl_ntoh64(osm_node_get_node_guid(port->p_node));
1516                 for (i = 1; i < adj_list_size; i++) {
1517                         if (guid == adj_list[i].guid) {
1518                                 index = i;
1519                                 break;
1520                         }
1521                 }
1522         }
1523
1524         /* source in dijkstra */
1525         adj_list[index].distance = 0;
1526         adj_list[index].state = DISCOVERED;
1527         adj_list[index].hops = 0;       /* the source has hop count = 0 */
1528
1529         /* create a heap to find (efficient) the node with the smallest distance */
1530         if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA)
1531                 err = heap_create(adj_list, adj_list_size, &heap);
1532         else
1533                 err = heap_create(&adj_list[1], adj_list_size - 1, &heap);
1534         if (err) {
1535                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1536                         "ERR AD09: cannot allocate memory for heap or heap->node in heap_create(...)\n");
1537                 return err;
1538         }
1539
1540         current = heap_getmin(heap);
1541         while (current) {
1542                 current->state = DISCOVERED;
1543                 if (current->used_link) /* increment the number of hops to the source for each new node */
1544                         current->hops =
1545                             adj_list[current->used_link->from].hops + 1;
1546
1547                 /* add/update nodes which aren't discovered but accessible */
1548                 for (link = current->links; link != NULL; link = link->next) {
1549                         if ((adj_list[link->to].state != DISCOVERED)
1550                             && (current->distance + link->weight <
1551                                 adj_list[link->to].distance)) {
1552                                 adj_list[link->to].used_link = link;
1553                                 adj_list[link->to].distance =
1554                                     current->distance + link->weight;
1555                                 heap_heapify(heap, adj_list[link->to].heap_id);
1556                         }
1557                 }
1558
1559                 current = heap_getmin(heap);
1560         }
1561
1562         /* destroy the heap */
1563         heap_free(heap);
1564         heap = NULL;
1565
1566         OSM_LOG_EXIT(p_mgr->p_log);
1567         return 0;
1568 }
1569
1570 /* update the linear forwarding tables of all switches with the informations
1571    from the last dijsktra step
1572 */
1573 static int update_lft(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1574                       uint32_t adj_list_size, osm_port_t * p_port, uint16_t lid)
1575 {
1576         uint32_t i = 0;
1577         uint8_t port = 0;
1578         uint8_t hops = 0;
1579         osm_switch_t *p_sw = NULL;
1580         boolean_t is_ignored_by_port_prof = FALSE;
1581         osm_physp_t *p = NULL;
1582         cl_status_t ret;
1583
1584         OSM_LOG_ENTER(p_mgr->p_log);
1585
1586         for (i = 1; i < adj_list_size; i++) {
1587                 /* if no route goes thru this switch -> cycle */
1588                 if (!(adj_list[i].used_link))
1589                         continue;
1590
1591                 p_sw = adj_list[i].sw;
1592                 hops = adj_list[i].hops;
1593                 port = adj_list[i].used_link->to_port;
1594                 /* the used_link is the link that was used in dijkstra to reach this node,
1595                    so the to_port is the local port on this node
1596                  */
1597
1598                 if (port == OSM_NO_PATH) {      /* if clause shouldn't be possible in this routing, but who cares */
1599                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1600                                 "ERR AD06: No path to get to LID %" PRIu16
1601                                 " from switch 0x%" PRIx64 "\n", lid,
1602                                 cl_ntoh64(osm_node_get_node_guid
1603                                           (p_sw->p_node)));
1604
1605                         /* do not try to overwrite the ppro of non existing port ... */
1606                         is_ignored_by_port_prof = TRUE;
1607                         return 1;
1608                 } else {
1609                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1610                                 "Routing LID %" PRIu16 " to port %" PRIu8
1611                                 " for switch 0x%" PRIx64 "\n", lid, port,
1612                                 cl_ntoh64(osm_node_get_node_guid
1613                                           (p_sw->p_node)));
1614
1615                         p = osm_node_get_physp_ptr(p_sw->p_node, port);
1616                         if (!p) {
1617                                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1618                                         "ERR AD0A: Physical port %d of Node GUID 0x%"
1619                                         PRIx64 "not found\n", port,
1620                                         cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
1621                                 return 1;
1622                         }
1623
1624                         /* we would like to optionally ignore this port in equalization
1625                            as in the case of the Mellanox Anafa Internal PCI TCA port
1626                          */
1627                         is_ignored_by_port_prof = p->is_prof_ignored;
1628
1629                         /* We also would ignore this route if the target lid is of
1630                            a switch and the port_profile_switch_node is not TRUE
1631                          */
1632                         if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
1633                                 is_ignored_by_port_prof |=
1634                                     (osm_node_get_type(p_port->p_node) ==
1635                                      IB_NODE_TYPE_SWITCH);
1636                 }
1637
1638                 /* to support lmc > 0 the functions alloc_ports_priv, free_ports_priv, find_and_add_remote_sys
1639                    from minhop aren't needed cause osm_switch_recommend_path is implicitly calculated
1640                    for each LID pair thru dijkstra;
1641                    for each port the dijkstra algorithm calculates (max_lid_ho - min_lid_ho)-times maybe
1642                    disjoint routes to spread the bandwidth -> diffent routes for one port and lmc>0
1643                  */
1644
1645                 /* set port in LFT */
1646                 p_sw->new_lft[lid] = port;
1647                 if (!is_ignored_by_port_prof) {
1648                         /* update the number of path routing thru this port */
1649                         osm_switch_count_path(p_sw, port);
1650                 }
1651                 /* set the hop count from this switch to the lid */
1652                 ret = osm_switch_set_hops(p_sw, lid, port, hops);
1653                 if (ret != CL_SUCCESS)
1654                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1655                                 "ERR AD05: cannot set hops for LID %" PRIu16
1656                                 " at switch 0x%" PRIx64 "\n", lid,
1657                                 cl_ntoh64(osm_node_get_node_guid
1658                                           (p_sw->p_node)));
1659         }
1660
1661         OSM_LOG_EXIT(p_mgr->p_log);
1662         return 0;
1663 }
1664
1665 /* the function updates the multicast group membership information
1666    similar to create_mgrp_switch_map (osm_mcast_mgr.c)
1667    => with it we can identify if a switch needs to be processed
1668    or not in update_mcft
1669 */
1670 static void update_mgrp_membership(cl_qlist_t * port_list)
1671 {
1672         osm_mcast_work_obj_t *wobj = NULL;
1673         osm_port_t *port = NULL;
1674         osm_switch_t *sw = NULL;
1675         cl_list_item_t *i = NULL;
1676
1677         for (i = cl_qlist_head(port_list); i != cl_qlist_end(port_list);
1678              i = cl_qlist_next(i)) {
1679                 wobj = cl_item_obj(i, wobj, list_item);
1680                 port = wobj->p_port;
1681                 if (port->p_node->sw) {
1682                         sw = port->p_node->sw;
1683                         sw->is_mc_member = 1;
1684                 } else {
1685                         sw = port->p_physp->p_remote_physp->p_node->sw;
1686                         sw->num_of_mcm++;
1687                 }
1688         }
1689 }
1690
1691 /* reset is_mc_member and num_of_mcm for future computations */
1692 static void reset_mgrp_membership(vertex_t * adj_list, uint32_t adj_list_size)
1693 {
1694         uint32_t i = 0;
1695
1696         for (i = 1; i < adj_list_size; i++) {
1697                 if (adj_list[i].dropped)
1698                         continue;
1699
1700                 adj_list[i].sw->is_mc_member = 0;
1701                 adj_list[i].sw->num_of_mcm = 0;
1702         }
1703 }
1704
1705 /* update the multicast forwarding tables of all switches with the informations
1706    from the previous dijsktra step for the current mlid
1707 */
1708 static int update_mcft(osm_sm_t * p_sm, vertex_t * adj_list,
1709                        uint32_t adj_list_size, uint16_t mlid_ho,
1710                        cl_qmap_t * port_map, osm_switch_t * root_sw)
1711 {
1712         uint32_t i = 0;
1713         uint8_t port = 0, remote_port = 0;
1714         uint8_t upstream_port = 0, downstream_port = 0;
1715         ib_net64_t guid = 0;
1716         osm_switch_t *p_sw = NULL;
1717         osm_node_t *remote_node = NULL;
1718         osm_physp_t *p_physp = NULL;
1719         osm_mcast_tbl_t *p_tbl = NULL;
1720         vertex_t *curr_adj = NULL;
1721
1722         OSM_LOG_ENTER(p_sm->p_log);
1723
1724         for (i = 1; i < adj_list_size; i++) {
1725                 if (adj_list[i].dropped)
1726                         continue;
1727
1728                 p_sw = adj_list[i].sw;
1729                 OSM_LOG(p_sm->p_log, OSM_LOG_VERBOSE,
1730                         "Processing switch 0x%016" PRIx64
1731                         " (%s) for MLID 0x%X\n", cl_ntoh64(adj_list[i].guid),
1732                         p_sw->p_node->print_desc, mlid_ho);
1733
1734                 /* if a) the switch does not support mcast  or
1735                       b) no ports of this switch are part or the mcast group
1736                    then cycle
1737                  */
1738                 if (osm_switch_supports_mcast(p_sw) == FALSE ||
1739                     (p_sw->num_of_mcm == 0 && !(p_sw->is_mc_member)))
1740                         continue;
1741
1742                 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1743
1744                 /* add all ports of this sw to the mcast table,
1745                    if they are part of the mcast grp
1746                  */
1747                 if (p_sw->is_mc_member)
1748                         osm_mcast_tbl_set(p_tbl, mlid_ho, 0);
1749                 for (port = 1; port < p_sw->num_ports; port++) {
1750                         /* get the node behind the port */
1751                         remote_node =
1752                                 osm_node_get_remote_node(p_sw->p_node, port,
1753                                                          &remote_port);
1754                         /* check if connected and its not the same switch */
1755                         if (!remote_node || remote_node->sw == p_sw)
1756                                 continue;
1757                         /* make sure the link is healthy */
1758                         p_physp = osm_node_get_physp_ptr(p_sw->p_node, port);
1759                         if (!p_physp || !osm_link_is_healthy(p_physp))
1760                                 continue;
1761                         /* we don't add upstream ports in this step */
1762                         if (osm_node_get_type(remote_node) != IB_NODE_TYPE_CA)
1763                                 continue;
1764
1765                         guid = osm_physp_get_port_guid(osm_node_get_physp_ptr(
1766                                                        remote_node,
1767                                                        remote_port));
1768                         if (cl_qmap_get(port_map, guid)
1769                             != cl_qmap_end(port_map))
1770                                 osm_mcast_tbl_set(p_tbl, mlid_ho, port);
1771                 }
1772
1773                 /* now we have to add the upstream port of 'this' switch and
1774                    the downstream port of the next switch to the mcast table
1775                    until we reach the root_sw
1776                  */
1777                 curr_adj = &adj_list[i];
1778                 while (curr_adj->sw != root_sw) {
1779                         /* the used_link is the link that was used in dijkstra to reach this node,
1780                            so the to_port is the local (upstream) port on curr_adj->sw
1781                          */
1782                         upstream_port = curr_adj->used_link->to_port;
1783                         osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
1784
1785                         /* now we go one step in direction root_sw and add the
1786                            downstream port for the spanning tree
1787                          */
1788                         downstream_port = curr_adj->used_link->from_port;
1789                         p_tbl = osm_switch_get_mcast_tbl_ptr(
1790                                 adj_list[curr_adj->used_link->from].sw);
1791                         osm_mcast_tbl_set(p_tbl, mlid_ho, downstream_port);
1792
1793                         curr_adj = &adj_list[curr_adj->used_link->from];
1794                 }
1795         }
1796
1797         OSM_LOG_EXIT(p_sm->p_log);
1798         return 0;
1799 }
1800
1801 /* increment the edge weights of the df-/sssp graph which represent the number
1802    of paths on this link
1803 */
1804 static void update_weights(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1805                            uint32_t adj_list_size)
1806 {
1807         uint32_t i = 0, j = 0;
1808         uint32_t additional_weight = 0;
1809
1810         OSM_LOG_ENTER(p_mgr->p_log);
1811
1812         for (i = 1; i < adj_list_size; i++) {
1813                 /* if no route goes thru this switch -> cycle */
1814                 if (!(adj_list[i].used_link))
1815                         continue;
1816                 additional_weight = adj_list[i].num_hca;
1817
1818                 j = i;
1819                 while (adj_list[j].used_link) {
1820                         /* update the link from pre to this node */
1821                         adj_list[j].used_link->weight += additional_weight;
1822
1823                         j = adj_list[j].used_link->from;
1824                 }
1825         }
1826
1827         OSM_LOG_EXIT(p_mgr->p_log);
1828 }
1829
1830 /* get the largest number of virtual lanes which is supported by all switches
1831    in the subnet
1832 */
1833 static uint8_t get_avail_vl_in_subn(osm_ucast_mgr_t * p_mgr)
1834 {
1835         uint32_t i = 0;
1836         uint8_t vls_avail = 0xFF, port_vls_avail = 0;
1837         cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1838         cl_map_item_t *item = NULL;
1839         osm_switch_t *sw = NULL;
1840
1841         /* traverse all switches to get the number of available virtual lanes in the subnet */
1842         for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1843              item = cl_qmap_next(item)) {
1844                 sw = (osm_switch_t *) item;
1845
1846                 /* ignore management port 0 */
1847                 for (i = 1; i < osm_node_get_num_physp(sw->p_node); i++) {
1848                         osm_physp_t *p_physp =
1849                             osm_node_get_physp_ptr(sw->p_node, i);
1850
1851                         if (p_physp && p_physp->p_remote_physp) {
1852                                 port_vls_avail =
1853                                     ib_port_info_get_op_vls(&p_physp->
1854                                                             port_info);
1855                                 if (port_vls_avail
1856                                     && port_vls_avail < vls_avail)
1857                                         vls_avail = port_vls_avail;
1858                         }
1859                 }
1860         }
1861
1862         /* ib_port_info_get_op_vls gives values 1 ... 5 (s. IBAS 14.2.5.6) */
1863         vls_avail = 1 << (vls_avail - 1);
1864
1865         /* set boundaries (s. IBAS 3.5.7) */
1866         if (vls_avail > 15)
1867                 vls_avail = 15;
1868         if (vls_avail < 1)
1869                 vls_avail = 1;
1870
1871         return vls_avail;
1872 }
1873
1874 /* search for cycles in the channel dependency graph to identify possible
1875    deadlocks in the network;
1876    assign new virtual lanes to some paths to break the deadlocks
1877 */
1878 static int dfsssp_remove_deadlocks(dfsssp_context_t * dfsssp_ctx)
1879 {
1880         osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
1881
1882         cl_qlist_t *port_tbl = &p_mgr->port_order_list; /* 1 management port per switch + 1 or 2 ports for each Hca */
1883         cl_list_item_t *item1 = NULL, *item2 = NULL;
1884         osm_port_t *src_port = NULL, *dest_port = NULL;
1885
1886         uint32_t i = 0, j = 0, err = 0;
1887         uint8_t vl = 0, test_vl = 0, vl_avail = 0, vl_needed = 1;
1888         double most_avg_paths = 0.0;
1889         cdg_node_t **cdg = NULL, *start_here = NULL, *cycle = NULL;
1890         cdg_link_t *weakest_link = NULL;
1891         uint32_t srcdest = 0;
1892
1893         vltable_t *srcdest2vl_table = NULL;
1894         uint8_t lmc = 0;
1895         uint16_t slid = 0, dlid = 0, min_lid_ho = 0, max_lid_ho =
1896             0, min_lid_ho2 = 0, max_lid_ho2 = 0;;
1897         uint64_t *paths_per_vl = NULL;
1898         uint64_t from = 0, to = 0, count = 0;
1899         uint8_t *split_count = NULL;
1900         uint8_t ntype = 0;
1901
1902         OSM_LOG_ENTER(p_mgr->p_log);
1903         OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1904                 "Assign each src/dest pair a Virtual Lanes, to remove deadlocks in the routing\n");
1905
1906         vl_avail = get_avail_vl_in_subn(p_mgr);
1907         OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
1908                 "Virtual Lanes available: %" PRIu8 "\n", vl_avail);
1909
1910         paths_per_vl = (uint64_t *) malloc(vl_avail * sizeof(uint64_t));
1911         if (!paths_per_vl) {
1912                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1913                         "ERR AD22: cannot allocate memory for paths_per_vl\n");
1914                 return 1;
1915         }
1916         memset(paths_per_vl, 0, vl_avail * sizeof(uint64_t));
1917
1918         cdg = (cdg_node_t **) malloc(vl_avail * sizeof(cdg_node_t *));
1919         if (!cdg) {
1920                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1921                         "ERR AD23: cannot allocate memory for cdg\n");
1922                 free(paths_per_vl);
1923                 return 1;
1924         }
1925         for (i = 0; i < vl_avail; i++)
1926                 cdg[i] = NULL;
1927
1928         count = 0;
1929         /* count all ports (also multiple LIDs) of type CA or SP0 for size of VL table */
1930         for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1931              item1 = cl_qlist_next(item1)) {
1932                 dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1933                                                       list_item);
1934                 ntype = osm_node_get_type(dest_port->p_node);
1935                 if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1936                         /* only SP0 with SLtoVLMapping support will be processed */
1937                         if (ntype == IB_NODE_TYPE_SWITCH
1938                             && !(dest_port->p_physp->port_info.capability_mask
1939                             & IB_PORT_CAP_HAS_SL_MAP))
1940                                 continue;
1941
1942                         lmc = osm_port_get_lmc(dest_port);
1943                         count += (1 << lmc);
1944                 }
1945         }
1946         /* allocate VL table and indexing array */
1947         err = vltable_alloc(&srcdest2vl_table, count);
1948         if (err) {
1949                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1950                         "ERR AD26: cannot allocate memory for srcdest2vl_table\n");
1951                 goto ERROR;
1952         }
1953
1954         i = 0;
1955         /* fill lids into indexing array */
1956         for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1957              item1 = cl_qlist_next(item1)) {
1958                 dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1959                                                       list_item);
1960                 ntype = osm_node_get_type(dest_port->p_node);
1961                 if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1962                         /* only SP0 with SLtoVLMapping support will be processed */
1963                         if (ntype == IB_NODE_TYPE_SWITCH
1964                             && !(dest_port->p_physp->port_info.capability_mask
1965                             & IB_PORT_CAP_HAS_SL_MAP))
1966                                 continue;
1967
1968                         osm_port_get_lid_range_ho(dest_port, &min_lid_ho,
1969                                                   &max_lid_ho);
1970                         for (dlid = min_lid_ho; dlid <= max_lid_ho; dlid++, i++)
1971                                 srcdest2vl_table->lids[i] = cl_hton16(dlid);
1972                 }
1973         }
1974         /* sort lids */
1975         vltable_sort_lids(srcdest2vl_table);
1976
1977         test_vl = 0;
1978         /* fill cdg[0] with routes from each src/dest port combination for all Hca/SP0 in the subnet */
1979         for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1980              item1 = cl_qlist_next(item1)) {
1981                 dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1982                                                       list_item);
1983                 ntype = osm_node_get_type(dest_port->p_node);
1984                 if ((ntype != IB_NODE_TYPE_CA && ntype != IB_NODE_TYPE_SWITCH)
1985                     || !(dest_port->p_physp->port_info.capability_mask
1986                     & IB_PORT_CAP_HAS_SL_MAP))
1987                         continue;
1988
1989                 for (item2 = cl_qlist_head(port_tbl);
1990                      item2 != cl_qlist_end(port_tbl);
1991                      item2 = cl_qlist_next(item2)) {
1992                         src_port = (osm_port_t *)cl_item_obj(item2, src_port,
1993                                                              list_item);
1994                         ntype = osm_node_get_type(src_port->p_node);
1995                         if ((ntype != IB_NODE_TYPE_CA
1996                             && ntype != IB_NODE_TYPE_SWITCH)
1997                             || !(src_port->p_physp->port_info.capability_mask
1998                             & IB_PORT_CAP_HAS_SL_MAP))
1999                                 continue;
2000
2001                         if (src_port != dest_port) {
2002                                 /* iterate over LIDs of src and dest port */
2003                                 osm_port_get_lid_range_ho(src_port, &min_lid_ho,
2004                                                           &max_lid_ho);
2005                                 for (slid = min_lid_ho; slid <= max_lid_ho;
2006                                      slid++) {
2007                                         osm_port_get_lid_range_ho
2008                                             (dest_port, &min_lid_ho2,
2009                                              &max_lid_ho2);
2010                                         for (dlid = min_lid_ho2;
2011                                              dlid <= max_lid_ho2;
2012                                              dlid++) {
2013
2014                                                 /* try to add the path to cdg[0] */
2015                                                 err =
2016                                                     update_channel_dep_graph
2017                                                     (&(cdg[test_vl]),
2018                                                      src_port, slid,
2019                                                      dest_port, dlid);
2020                                                 if (err) {
2021                                                         OSM_LOG(p_mgr->
2022                                                                 p_log,
2023                                                                 OSM_LOG_ERROR,
2024                                                                 "ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2025                                                         goto ERROR;
2026                                                 }
2027                                                 /* add the <s,d> combination / corresponding virtual lane to the VL table */
2028                                                 vltable_insert
2029                                                     (srcdest2vl_table,
2030                                                      cl_hton16(slid),
2031                                                      cl_hton16(dlid),
2032                                                      test_vl);
2033                                                 paths_per_vl[test_vl]++;
2034
2035                                         }
2036
2037                                 }
2038                         }
2039
2040                 }
2041         }
2042         dfsssp_ctx->srcdest2vl_table = srcdest2vl_table;
2043
2044         /* test all cdg for cycles and break the cycles by moving paths on the weakest link to the next cdg */
2045         for (test_vl = 0; test_vl < vl_avail - 1; test_vl++) {
2046                 start_here = cdg[test_vl];
2047                 while (start_here) {
2048                         cycle =
2049                             search_cycle_in_channel_dep_graph(cdg[test_vl],
2050                                                               start_here);
2051
2052                         if (cycle) {
2053                                 vl_needed = test_vl + 2;
2054
2055                                 /* calc weakest link n cycle */
2056                                 weakest_link = get_weakest_link_in_cycle(cycle);
2057                                 if (!weakest_link) {
2058                                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2059                                                 "ERR AD27: something went wrong in get_weakest_link_in_cycle(...)\n");
2060                                         err = 1;
2061                                         goto ERROR;
2062                                 }
2063
2064                                 paths_per_vl[test_vl] -=
2065                                     weakest_link->num_pairs;
2066                                 paths_per_vl[test_vl + 1] +=
2067                                     weakest_link->num_pairs;
2068
2069                                 /* move all <s,d> paths on this link to the next cdg */
2070                                 for (i = 0; i < weakest_link->num_pairs; i++) {
2071                                         srcdest =
2072                                             get_next_srcdest_pair(weakest_link,
2073                                                                   i);
2074                                         slid = (uint16_t) (srcdest >> 16);
2075                                         dlid =
2076                                             (uint16_t) ((srcdest << 16) >> 16);
2077
2078                                         /* only move if not moved in a previous step */
2079                                         if (test_vl !=
2080                                             (uint8_t)
2081                                             vltable_get_vl(srcdest2vl_table,
2082                                                            cl_hton16(slid),
2083                                                            cl_hton16(dlid))) {
2084                                                 /* this path has been moved
2085                                                    before -> don't count
2086                                                  */
2087                                                 paths_per_vl[test_vl]++;
2088                                                 paths_per_vl[test_vl + 1]--;
2089                                                 continue;
2090                                         }
2091
2092                                         src_port =
2093                                             osm_get_port_by_lid(p_mgr->p_subn,
2094                                                                 cl_hton16
2095                                                                 (slid));
2096                                         dest_port =
2097                                             osm_get_port_by_lid(p_mgr->p_subn,
2098                                                                 cl_hton16
2099                                                                 (dlid));
2100
2101                                         /* remove path from current cdg / vl */
2102                                         err =
2103                                             remove_path_from_cdg(&
2104                                                                  (cdg[test_vl]),
2105                                                                  src_port, slid,
2106                                                                  dest_port,
2107                                                                  dlid);
2108                                         if (err) {
2109                                                 OSM_LOG(p_mgr->p_log,
2110                                                         OSM_LOG_ERROR,
2111                                                         "ERR AD44: something went wrong in remove_path_from_cdg(...)\n");
2112                                                 goto ERROR;
2113                                         }
2114
2115                                         /* add path to next cdg / vl */
2116                                         err =
2117                                             update_channel_dep_graph(&
2118                                                                      (cdg
2119                                                                       [test_vl +
2120                                                                        1]),
2121                                                                      src_port,
2122                                                                      slid,
2123                                                                      dest_port,
2124                                                                      dlid);
2125                                         if (err) {
2126                                                 OSM_LOG(p_mgr->p_log,
2127                                                         OSM_LOG_ERROR,
2128                                                         "ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2129                                                 goto ERROR;
2130                                         }
2131                                         vltable_insert(srcdest2vl_table,
2132                                                        cl_hton16(slid),
2133                                                        cl_hton16(dlid),
2134                                                        test_vl + 1);
2135                                 }
2136
2137                                 if (weakest_link->num_pairs)
2138                                         free(weakest_link->srcdest_pairs);
2139                                 if (weakest_link)
2140                                         free(weakest_link);
2141                         }
2142
2143                         start_here = cycle;
2144                 }
2145         }
2146
2147         /* test the last avail cdg for a cycle;
2148            if there is one, than vl_needed > vl_avail
2149          */
2150         start_here = cdg[vl_avail - 1];
2151         if (start_here) {
2152                 cycle =
2153                     search_cycle_in_channel_dep_graph(cdg[vl_avail - 1],
2154                                                       start_here);
2155                 if (cycle) {
2156                         vl_needed = vl_avail + 1;
2157                 }
2158         }
2159
2160         OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2161                 "Virtual Lanes needed: %" PRIu8 "\n", vl_needed);
2162         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2163                 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2164                         "Paths per VL (before balancing):\n");
2165                 for (i = 0; i < vl_avail; i++)
2166                         OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2167                                 "   %" PRIu32 ". lane: %" PRIu64 "\n", i,
2168                                 paths_per_vl[i]);
2169         }
2170
2171         OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2172                 "Balancing the paths on the available Virtual Lanes\n");
2173
2174         /* optimal balancing virtual lanes, under condition: no additional cycle checks;
2175            sl/vl != 0 might be assigned to loopback packets (i.e. slid/dlid on the
2176            same port for lmc>0), but thats no problem, see IBAS 10.2.2.3
2177          */
2178         split_count = (uint8_t *) calloc(vl_avail, sizeof(uint8_t));
2179         if (!split_count) {
2180                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2181                         "ERR AD24: cannot allocate memory for split_count, skip balancing\n");
2182                 err = 1;
2183                 goto ERROR;
2184         }
2185         /* initial state: paths for VLs won't be separated */
2186         for (i = 0; i < ((vl_needed < vl_avail) ? vl_needed : vl_avail); i++)
2187                 split_count[i] = 1;
2188         dfsssp_ctx->vl_split_count = split_count;
2189         /* balancing is necessary if we have empty VLs */
2190         if (vl_needed < vl_avail) {
2191                 /* split paths of VLs until we find an equal distribution */
2192                 for (i = vl_needed; i < vl_avail; i++) {
2193                         /* find VL with most paths in it */
2194                         vl = 0;
2195                         most_avg_paths = 0.0;
2196                         for (test_vl = 0; test_vl < vl_needed; test_vl++) {
2197                                 if (most_avg_paths <
2198                                     ((double)paths_per_vl[test_vl] /
2199                                     split_count[test_vl])) {
2200                                         vl = test_vl;
2201                                         most_avg_paths =
2202                                                 (double)paths_per_vl[test_vl] /
2203                                                 split_count[test_vl];
2204                                 }
2205                         }
2206                         split_count[vl]++;
2207                 }
2208                 /* change the VL assignment depending on split_count for
2209                    all VLs except VL 0
2210                  */
2211                 for (from = vl_needed - 1; from > 0; from--) {
2212                         /* how much space needed for others? */
2213                         to = 0;
2214                         for (i = 0; i < from; i++)
2215                                 to += split_count[i];
2216                         count = paths_per_vl[from];
2217                         vltable_change_vl(srcdest2vl_table, from, to, count);
2218                         /* change also the information within the split_count
2219                            array; this is important for fast calculation later
2220                          */
2221                         split_count[to] = split_count[from];
2222                         split_count[from] = 0;
2223                         paths_per_vl[to] = paths_per_vl[from];
2224                         paths_per_vl[from] = 0;
2225                 }
2226         } else if (vl_needed > vl_avail) {
2227                 /* routing not possible, a further development would be the LASH-TOR approach (update: LASH-TOR isn't possible, there is a mistake in the theory) */
2228                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2229                         "ERR AD25: Not enough VLs available (avail=%d, needed=%d); Stopping dfsssp routing!\n",
2230                         vl_avail, vl_needed);
2231                 err = 1;
2232                 goto ERROR;
2233         }
2234         /* else { no balancing } */
2235
2236         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2237                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2238                         "Virtual Lanes per src/dest combination after balancing:\n");
2239                 vltable_print(p_mgr, srcdest2vl_table);
2240         }
2241         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2242                 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2243                         "Approx. #paths per VL (after balancing):\n");
2244                 j = 0;
2245                 count = 1; /* to prevent div. by 0 */
2246                 for (i = 0; i < vl_avail; i++) {
2247                         if (split_count[i] > 0) {
2248                                 j = i;
2249                                 count = split_count[i];
2250                         }
2251                         OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2252                                 "   %" PRIu32 ". lane: %" PRIu64 "\n", i,
2253                                 paths_per_vl[j] / count);
2254                 }
2255         }
2256
2257         free(paths_per_vl);
2258
2259         /* deallocate channel dependency graphs */
2260         for (i = 0; i < vl_avail; i++)
2261                 cdg_dealloc(&cdg[i]);
2262         free(cdg);
2263
2264         OSM_LOG_EXIT(p_mgr->p_log);
2265         return 0;
2266
2267 ERROR:
2268         free(paths_per_vl);
2269
2270         for (i = 0; i < vl_avail; i++)
2271                 cdg_dealloc(&cdg[i]);
2272         free(cdg);
2273
2274         vltable_dealloc(&srcdest2vl_table);
2275         dfsssp_ctx->srcdest2vl_table = NULL;
2276
2277         return err;
2278 }
2279
2280 /* meta function which calls subfunctions for dijkstra, update lft and weights,
2281    (and remove deadlocks) to calculate the routing for the subnet
2282 */
2283 static int dfsssp_do_dijkstra_routing(void *context)
2284 {
2285         dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2286         osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2287         vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2288         uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2289
2290         vertex_t **sw_list = NULL;
2291         uint32_t sw_list_size = 0;
2292         uint64_t guid = 0;
2293         cl_qlist_t *qlist = NULL;
2294         cl_list_item_t *qlist_item = NULL;
2295
2296         cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
2297         cl_qmap_t cn_tbl, io_tbl, *p_mixed_tbl = NULL;
2298         cl_map_item_t *item = NULL;
2299         osm_switch_t *sw = NULL;
2300         osm_port_t *port = NULL;
2301         uint32_t i = 0, err = 0;
2302         uint16_t lid = 0, min_lid_ho = 0, max_lid_ho = 0;
2303         uint8_t lmc = 0;
2304         boolean_t cn_nodes_provided = FALSE, io_nodes_provided = FALSE;
2305
2306         OSM_LOG_ENTER(p_mgr->p_log);
2307         OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2308                 "Calculating shortest path from all Hca/switches to all\n");
2309
2310         cl_qmap_init(&cn_tbl);
2311         cl_qmap_init(&io_tbl);
2312         p_mixed_tbl = &cn_tbl;
2313
2314         cl_qlist_init(&p_mgr->port_order_list);
2315
2316         /* reset the new_lft for each switch */
2317         for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2318              item = cl_qmap_next(item)) {
2319                 sw = (osm_switch_t *) item;
2320                 /* initialize LIDs in buffer to invalid port number */
2321                 memset(sw->new_lft, OSM_NO_PATH, sw->max_lid_ho + 1);
2322                 /* initialize LFT and hop count for bsp0/esp0 of the switch */
2323                 min_lid_ho = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
2324                 lmc = osm_node_get_lmc(sw->p_node, 0);
2325                 for (i = min_lid_ho; i < min_lid_ho + (1 << lmc); i++) {
2326                         /* for each switch the port to the 'self'lid is the management port 0 */
2327                         sw->new_lft[i] = 0;
2328                         /* the hop count to the 'self'lid is 0 for each switch */
2329                         osm_switch_set_hops(sw, i, 0, 0);
2330                 }
2331         }
2332
2333         /* we need an intermediate array of pointers to switches in adj_list;
2334            this array will be sorted in respect to num_hca (descending)
2335          */
2336         sw_list_size = adj_list_size - 1;
2337         sw_list = (vertex_t **)malloc(sw_list_size * sizeof(vertex_t *));
2338         if (!sw_list) {
2339                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2340                         "ERR AD29: cannot allocate memory for sw_list in dfsssp_do_dijkstra_routing\n");
2341                 goto ERROR;
2342         }
2343         memset(sw_list, 0, sw_list_size * sizeof(vertex_t *));
2344
2345         /* fill the array with references to the 'real' sw in adj_list */
2346         for (i = 0; i < sw_list_size; i++)
2347                 sw_list[i] = &(adj_list[i + 1]);
2348
2349         /* sort the sw_list in descending order */
2350         sw_list_sort_by_num_hca(sw_list, sw_list_size);
2351
2352         /* parse compute node guid file, if provided by the user */
2353         if (p_mgr->p_subn->opt.cn_guid_file) {
2354                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2355                         "Parsing compute nodes from file %s\n",
2356                         p_mgr->p_subn->opt.cn_guid_file);
2357
2358                 if (parse_node_map(p_mgr->p_subn->opt.cn_guid_file,
2359                                    add_guid_to_map, &cn_tbl)) {
2360                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2361                                 "ERR AD33: Problem parsing compute node guid file\n");
2362                         goto ERROR;
2363                 }
2364
2365                 if (cl_is_qmap_empty(&cn_tbl))
2366                         OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2367                                 "WRN AD34: compute node guids file contains no valid guids\n");
2368                 else
2369                         cn_nodes_provided = TRUE;
2370         }
2371
2372         /* parse I/O guid file, if provided by the user */
2373         if (p_mgr->p_subn->opt.io_guid_file) {
2374                 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2375                         "Parsing I/O nodes from file %s\n",
2376                         p_mgr->p_subn->opt.io_guid_file);
2377
2378                 if (parse_node_map(p_mgr->p_subn->opt.io_guid_file,
2379                                    add_guid_to_map, &io_tbl)) {
2380                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2381                                 "ERR AD35: Problem parsing I/O guid file\n");
2382                         goto ERROR;
2383                 }
2384
2385                 if (cl_is_qmap_empty(&io_tbl))
2386                         OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2387                                 "WRN AD36: I/O node guids file contains no valid guids\n");
2388                 else
2389                         io_nodes_provided = TRUE;
2390         }
2391
2392         /* if we mix Hca/Tca/SP0 during the dijkstra routing, we might end up
2393            in rare cases with a bad balancing for Hca<->Hca connections, i.e.
2394            some inter-switch links get oversubscribed with paths;
2395            therefore: add Hca ports first to ensure good Hca<->Hca balancing
2396          */
2397         if (cn_nodes_provided) {
2398                 for (i = 0; i < adj_list_size - 1; i++) {
2399                         if (sw_list[i] && sw_list[i]->sw) {
2400                                 sw = (osm_switch_t *)(sw_list[i]->sw);
2401                                 add_sw_endports_to_order_list(sw, p_mgr,
2402                                                               &cn_tbl, TRUE);
2403                         } else {
2404                                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2405                                         "ERR AD30: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2406                                 goto ERROR;
2407                         }
2408                 }
2409         }
2410         /* then: add Tca ports to ensure good Hca->Tca balancing and separate
2411            paths towards I/O nodes on the same switch (if possible)
2412          */
2413         if (io_nodes_provided) {
2414                 for (i = 0; i < adj_list_size - 1; i++) {
2415                         if (sw_list[i] && sw_list[i]->sw) {
2416                                 sw = (osm_switch_t *)(sw_list[i]->sw);
2417                                 add_sw_endports_to_order_list(sw, p_mgr,
2418                                                               &io_tbl, TRUE);
2419                         } else {
2420                                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2421                                         "ERR AD32: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2422                                 goto ERROR;
2423                         }
2424                 }
2425         }
2426         /* then: add anything else, such as administration nodes, ... */
2427         if (cn_nodes_provided && io_nodes_provided) {
2428                 cl_qmap_merge(&cn_tbl, &io_tbl);
2429         } else if (io_nodes_provided) {
2430                 p_mixed_tbl = &io_tbl;
2431         }
2432         for (i = 0; i < adj_list_size - 1; i++) {
2433                 if (sw_list[i] && sw_list[i]->sw) {
2434                         sw = (osm_switch_t *)(sw_list[i]->sw);
2435                         add_sw_endports_to_order_list(sw, p_mgr, p_mixed_tbl,
2436                                                       FALSE);
2437                 } else {
2438                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2439                                 "ERR AD39: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2440                         goto ERROR;
2441                 }
2442         }
2443         /* last: add SP0 afterwards which have lower priority for balancing */
2444         for (i = 0; i < sw_list_size; i++) {
2445                 if (sw_list[i] && sw_list[i]->sw) {
2446                         sw = (osm_switch_t *)(sw_list[i]->sw);
2447                         guid = cl_ntoh64(osm_node_get_node_guid(sw->p_node));
2448                         add_guid_to_order_list(guid, p_mgr);
2449                 } else {
2450                         OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2451                                 "ERR AD31: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2452                         goto ERROR;
2453                 }
2454         }
2455
2456         /* the intermediate array lived long enough */
2457         free(sw_list);
2458         sw_list = NULL;
2459         /* same is true for the compute node and I/O guid map */
2460         destroy_guid_map(&cn_tbl);
2461         cn_nodes_provided = FALSE;
2462         destroy_guid_map(&io_tbl);
2463         io_nodes_provided = FALSE;
2464
2465         /* do the routing for the each Hca in the subnet and each switch
2466            in the subnet (to add the routes to base/enhanced SP0)
2467          */
2468         qlist = &p_mgr->port_order_list;
2469         for (qlist_item = cl_qlist_head(qlist);
2470              qlist_item != cl_qlist_end(qlist);
2471              qlist_item = cl_qlist_next(qlist_item)) {
2472                 port = (osm_port_t *)cl_item_obj(qlist_item, port, list_item);
2473
2474                 /* calculate shortest path with dijkstra from node to all switches/Hca */
2475                 if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
2476                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2477                                 "Processing Hca with GUID 0x%" PRIx64 "\n",
2478                                 cl_ntoh64(osm_node_get_node_guid
2479                                           (port->p_node)));
2480                 } else if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_SWITCH) {
2481                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2482                                 "Processing switch with GUID 0x%" PRIx64 "\n",
2483                                 cl_ntoh64(osm_node_get_node_guid
2484                                           (port->p_node)));
2485                 } else {
2486                         /* we don't handle routers, in case they show up */
2487                         continue;
2488                 }
2489
2490                 /* distribute the LID range across the ports that can reach those LIDs
2491                    to have disjoint paths for one destination port with lmc>0;
2492                    for switches with bsp0: min=max; with esp0: max>min if lmc>0
2493                  */
2494                 osm_port_get_lid_range_ho(port, &min_lid_ho,
2495                                           &max_lid_ho);
2496                 for (lid = min_lid_ho; lid <= max_lid_ho; lid++) {
2497                         /* do dijkstra from this Hca/LID/SP0 to each switch */
2498                         err =
2499                             dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2500                         if (err)
2501                                 goto ERROR;
2502                         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2503                                 print_routes(p_mgr, adj_list, adj_list_size,
2504                                              port);
2505
2506                         /* make an update for the linear forwarding tables of the switches */
2507                         err =
2508                             update_lft(p_mgr, adj_list, adj_list_size, port, lid);
2509                         if (err)
2510                                 goto ERROR;
2511
2512                         /* add weights for calculated routes to adjust the weights for the next cycle */
2513                         update_weights(p_mgr, adj_list, adj_list_size);
2514
2515                         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2516                                 dfsssp_print_graph(p_mgr, adj_list,
2517                                                    adj_list_size);
2518                 }
2519         }
2520
2521         /* try deadlock removal only for the dfsssp routing (not for the sssp case, which is a subset of the dfsssp algorithm) */
2522         if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2523                 /* remove potential deadlocks by assigning different virtual lanes to src/dest paths and balance the lanes */
2524                 err = dfsssp_remove_deadlocks(dfsssp_ctx);
2525                 if (err)
2526                         goto ERROR;
2527         } else if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_SSSP) {
2528                 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2529                         "SSSP routing specified -> skipping deadlock removal thru dfsssp_remove_deadlocks(...)\n");
2530         } else {
2531                 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2532                         "ERR AD28: wrong routing engine specified in dfsssp_ctx\n");
2533                 goto ERROR;
2534         }
2535
2536         /* list not needed after the dijkstra steps and deadlock removal */
2537         cl_qlist_remove_all(&p_mgr->port_order_list);
2538
2539         /* print the new_lft for each switch after routing is done */
2540         if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2541                 for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2542                      item = cl_qmap_next(item)) {
2543                         sw = (osm_switch_t *) item;
2544                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2545                                 "Summary of the (new) LFT for switch 0x%" PRIx64
2546                                 " (%s):\n",
2547                                 cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
2548                                 sw->p_node->print_desc);
2549                         for (i = 0; i < sw->max_lid_ho + 1; i++)
2550                                 if (sw->new_lft[i] != OSM_NO_PATH) {
2551                                         OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2552                                                 "   for LID=%" PRIu32
2553                                                 " use port=%" PRIu8 "\n", i,
2554                                                 sw->new_lft[i]);
2555                                 }
2556                 }
2557         }
2558
2559         OSM_LOG_EXIT(p_mgr->p_log);
2560         return 0;
2561
2562 ERROR:
2563         if (!cl_is_qlist_empty(&p_mgr->port_order_list))
2564                 cl_qlist_remove_all(&p_mgr->port_order_list);
2565         if (cn_nodes_provided)
2566                 destroy_guid_map(&cn_tbl);
2567         if (io_nodes_provided)
2568                 destroy_guid_map(&io_tbl);
2569         if (sw_list)
2570                 free(sw_list);
2571         return -1;
2572 }
2573
2574 /* meta function which calls subfunctions for finding the optimal switch
2575    for the spanning tree, performing a dijkstra step with this sw as root,
2576    and calculating the mcast table for MLID
2577 */
2578 static ib_api_status_t dfsssp_do_mcast_routing(void * context,
2579                                                osm_mgrp_box_t * mbox)
2580 {
2581         dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2582         osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2583         osm_sm_t *sm = (osm_sm_t *) p_mgr->sm;
2584         vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2585         uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2586         cl_qlist_t mcastgrp_port_list;
2587         cl_qmap_t mcastgrp_port_map;
2588         osm_switch_t *root_sw = NULL, *p_sw = NULL;
2589         osm_port_t *port = NULL;
2590         ib_net16_t lid = 0;
2591         uint32_t err = 0, num_ports = 0, i = 0;
2592         ib_net64_t guid = 0;
2593         ib_api_status_t status = IB_SUCCESS;
2594
2595         OSM_LOG_ENTER(sm->p_log);
2596
2597         /* using the ucast cache feature with dfsssp might mean that a leaf sw
2598            got removed (and got back) without calling dfsssp_build_graph
2599            and therefore the adj_list (and pointers to osm's internal switches)
2600            could be outdated (here we have no knowledge if it has happened, so
2601            unfortunately a check is necessary... still better than rebuilding
2602            adj_list every time we arrive here)
2603          */
2604         if (p_mgr->p_subn->opt.use_ucast_cache && p_mgr->cache_valid) {
2605                 for (i = 1; i < adj_list_size; i++) {
2606                         guid = cl_hton64(adj_list[i].guid);
2607                         p_sw = osm_get_switch_by_guid(p_mgr->p_subn, guid);
2608                         if (p_sw) {
2609                                 /* check if switch came back from the dead */
2610                                 if (adj_list[i].dropped)
2611                                         adj_list[i].dropped = FALSE;
2612
2613                                 /* verify that sw object has not been moved
2614                                    (this can happen for a leaf switch, if it
2615                                    was dropped and came back later without a
2616                                    rerouting), otherwise we have to update
2617                                    dfsssp's internal switch list with the new
2618                                    sw pointer
2619                                  */
2620                                 if (p_sw == adj_list[i].sw)
2621                                         continue;
2622                                 else
2623                                         adj_list[i].sw = p_sw;
2624                         } else {
2625                                 /* if a switch from adj_list is not in the
2626                                    sw_guid_tbl anymore, then the only reason is
2627                                    that it was a leaf switch and opensm dropped
2628                                    it without calling a rerouting
2629                                    -> calling dijkstra is no problem, since it
2630                                       is a leaf and different from root_sw
2631                                    -> only update_mcft and reset_mgrp_membership
2632                                       need to be aware of these dropped switches
2633                                  */
2634                                 if (!adj_list[i].dropped)
2635                                         adj_list[i].dropped = TRUE;
2636                         }
2637                 }
2638         }
2639
2640         /* create a map and a list of all ports which are member in the mcast
2641            group; map for searching elements and list for iteration
2642          */
2643         if (osm_mcast_make_port_list_and_map(&mcastgrp_port_list,
2644                                              &mcastgrp_port_map, mbox)) {
2645                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD50: "
2646                         "Insufficient memory to make port list\n");
2647                 status = IB_ERROR;
2648                 goto Exit;
2649         }
2650
2651         num_ports = cl_qlist_count(&mcastgrp_port_list);
2652         if (num_ports < 2) {
2653                 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
2654                         "MLID 0x%X has %u members - nothing to do\n",
2655                         mbox->mlid, num_ports);
2656                 goto Exit;
2657         }
2658
2659         /* find the root switch for the spanning tree, which has the smallest
2660            hops count to all LIDs in the mcast group
2661          */
2662         root_sw = osm_mcast_mgr_find_root_switch(sm, &mcastgrp_port_list);
2663         if (!root_sw) {
2664                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD51: "
2665                         "Unable to locate a suitable switch for group 0x%X\n",
2666                         mbox->mlid);
2667                 status = IB_ERROR;
2668                 goto Exit;
2669         }
2670
2671         /* a) start one dijkstra step from the root switch to generate a
2672            spanning tree
2673            b) this might be a bit of an overkill to span the whole
2674            network, if there are only a few ports in the mcast group, but
2675            its only one dijkstra step for each mcast group and we did many
2676            steps before in the ucast routing for each LID in the subnet;
2677            c) we can use the subnet structure from the ucast routing, and
2678            don't even have to reset the link weights (=> therefore the mcast
2679            spanning tree will use less 'growded' links in the network)
2680            d) the mcast dfsssp algorithm will not change the link weights
2681          */
2682         lid = osm_node_get_base_lid(root_sw->p_node, 0);
2683         port = osm_get_port_by_lid(sm->p_subn, lid);
2684         err = dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2685         if (err) {
2686                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD52: "
2687                         "Dijkstra step for mcast failed for group 0x%X\n",
2688                         mbox->mlid);
2689                 status = IB_ERROR;
2690                 goto Exit;
2691         }
2692
2693         /* set mcast group membership again for update_mcft
2694            (unfortunately: osm_mcast_mgr_find_root_switch resets it)
2695          */
2696         update_mgrp_membership(&mcastgrp_port_list);
2697
2698         /* update the mcast forwarding tables of the switches */
2699         err = update_mcft(sm, adj_list, adj_list_size, mbox->mlid,
2700                           &mcastgrp_port_map, root_sw);
2701         if (err) {
2702                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD53: "
2703                         "Update of mcast forwarding tables failed for group 0x%X\n",
2704                         mbox->mlid);
2705                 status = IB_ERROR;
2706                 goto Exit;
2707         }
2708
2709 Exit:
2710         reset_mgrp_membership(adj_list, adj_list_size);
2711         osm_mcast_drop_port_list(&mcastgrp_port_list);
2712         OSM_LOG_EXIT(sm->p_log);
2713         return status;
2714 }
2715
2716 /* called from extern in QP creation process to gain the the service level and
2717    the virtual lane respectively for a <s,d> pair
2718 */
2719 static uint8_t get_dfsssp_sl(void *context, uint8_t hint_for_default_sl,
2720                              const ib_net16_t slid, const ib_net16_t dlid)
2721 {
2722         dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2723         osm_port_t *src_port, *dest_port;
2724         vltable_t *srcdest2vl_table = NULL;
2725         uint8_t *vl_split_count = NULL;
2726         osm_ucast_mgr_t *p_mgr = NULL;
2727         int32_t res = 0;
2728
2729         if (dfsssp_ctx
2730             && dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2731                 p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2732                 srcdest2vl_table = (vltable_t *) (dfsssp_ctx->srcdest2vl_table);
2733                 vl_split_count = (uint8_t *) (dfsssp_ctx->vl_split_count);
2734         }
2735         else
2736                 return hint_for_default_sl;
2737
2738         src_port = osm_get_port_by_lid(p_mgr->p_subn, slid);
2739         if (!src_port)
2740                 return hint_for_default_sl;
2741
2742         dest_port = osm_get_port_by_lid(p_mgr->p_subn, dlid);
2743         if (!dest_port)
2744                 return hint_for_default_sl;
2745
2746         if (!srcdest2vl_table)
2747                 return hint_for_default_sl;
2748
2749         res = vltable_get_vl(srcdest2vl_table, slid, dlid);
2750
2751         /* we will randomly distribute the traffic over multiple VLs if
2752            necessary for good balancing; therefore vl_split_count provides
2753            the number of VLs to use for certain traffic
2754          */
2755         if (res > -1) {
2756                 if (vl_split_count[res] > 1)
2757                         return (uint8_t) (res + rand()%(vl_split_count[res]));
2758                 else
2759                         return (uint8_t) res;
2760         } else
2761                 return hint_for_default_sl;
2762 }
2763
2764 static dfsssp_context_t *dfsssp_context_create(osm_opensm_t * p_osm,
2765                                                osm_routing_engine_type_t
2766                                                routing_type)
2767 {
2768         dfsssp_context_t *dfsssp_ctx = NULL;
2769
2770         /* allocate memory */
2771         dfsssp_ctx = (dfsssp_context_t *) malloc(sizeof(dfsssp_context_t));
2772         if (dfsssp_ctx) {
2773                 /* set initial values */
2774                 dfsssp_ctx->routing_type = routing_type;
2775                 dfsssp_ctx->p_mgr = (osm_ucast_mgr_t *) & (p_osm->sm.ucast_mgr);
2776                 dfsssp_ctx->adj_list = NULL;
2777                 dfsssp_ctx->adj_list_size = 0;
2778                 dfsssp_ctx->srcdest2vl_table = NULL;
2779                 dfsssp_ctx->vl_split_count = NULL;
2780         } else {
2781                 OSM_LOG(p_osm->sm.ucast_mgr.p_log, OSM_LOG_ERROR,
2782                         "ERR AD04: cannot allocate memory for dfsssp_ctx in dfsssp_context_create\n");
2783                 return NULL;
2784         }
2785
2786         return dfsssp_ctx;
2787 }
2788
2789 static void dfsssp_context_destroy(void *context)
2790 {
2791         dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2792         vertex_t *adj_list = (vertex_t *) (dfsssp_ctx->adj_list);
2793         uint32_t i = 0;
2794         link_t *link = NULL, *tmp = NULL;
2795
2796         /* free adj_list */
2797         for (i = 0; i < dfsssp_ctx->adj_list_size; i++) {
2798                 link = adj_list[i].links;
2799                 while (link) {
2800                         tmp = link;
2801                         link = link->next;
2802                         free(tmp);
2803                 }
2804         }
2805         free(adj_list);
2806         dfsssp_ctx->adj_list = NULL;
2807         dfsssp_ctx->adj_list_size = 0;
2808
2809         /* free srcdest2vl table and the split count information table
2810            (can be done, because dfsssp_context_destroy is called after
2811             osm_get_dfsssp_sl)
2812          */
2813         vltable_dealloc(&(dfsssp_ctx->srcdest2vl_table));
2814         dfsssp_ctx->srcdest2vl_table = NULL;
2815
2816         if (dfsssp_ctx->vl_split_count) {
2817                 free(dfsssp_ctx->vl_split_count);
2818                 dfsssp_ctx->vl_split_count = NULL;
2819         }
2820 }
2821
2822 static void delete(void *context)
2823 {
2824         if (!context)
2825                 return;
2826         dfsssp_context_destroy(context);
2827
2828         free(context);
2829 }
2830
2831 int osm_ucast_dfsssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2832 {
2833         /* create context container and add ucast management object */
2834         dfsssp_context_t *dfsssp_context =
2835             dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_DFSSSP);
2836         if (!dfsssp_context) {
2837                 return 1;       /* alloc failed -> skip this routing */
2838         }
2839
2840         /* reset function pointers to dfsssp routines */
2841         r->context = (void *)dfsssp_context;
2842         r->build_lid_matrices = dfsssp_build_graph;
2843         r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2844         r->mcast_build_stree = dfsssp_do_mcast_routing;
2845         r->path_sl = get_dfsssp_sl;
2846         r->destroy = delete;
2847
2848         /* we initialize with the current time to achieve a 'good' randomized
2849            assignment in get_dfsssp_sl(...)
2850          */
2851         srand(time(NULL));
2852
2853         return 0;
2854 }
2855
2856 int osm_ucast_sssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2857 {
2858         /* create context container and add ucast management object */
2859         dfsssp_context_t *dfsssp_context =
2860             dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_SSSP);
2861         if (!dfsssp_context) {
2862                 return 1;       /* alloc failed -> skip this routing */
2863         }
2864
2865         /* reset function pointers to sssp routines */
2866         r->context = (void *)dfsssp_context;
2867         r->build_lid_matrices = dfsssp_build_graph;
2868         r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2869         r->mcast_build_stree = dfsssp_do_mcast_routing;
2870         r->destroy = delete;
2871
2872         return 0;
2873 }