]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/ofed/opensm/opensm/osm_ucast_ftree.c
Merge llvm, clang, compiler-rt, libc++, libunwind, lld, lldb, and openmp
[FreeBSD/FreeBSD.git] / contrib / ofed / opensm / opensm / osm_ucast_ftree.c
1 /*
2  * Copyright (c) 2009 Simula Research Laboratory. All rights reserved.
3  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
4  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
5  * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
6  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  */
37
38 /*
39  * Abstract:
40  *    Implementation of OpenSM FatTree routing
41  */
42
43 #if HAVE_CONFIG_H
44 #  include <config.h>
45 #endif
46
47 #include <stdlib.h>
48 #include <string.h>
49 #include <ctype.h>
50 #include <errno.h>
51 #include <iba/ib_types.h>
52 #include <complib/cl_qmap.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_UCAST_FTREE_C
56 #include <opensm/osm_opensm.h>
57 #include <opensm/osm_switch.h>
58
/*
 * Supported FatTree rank range: [2, 8].
 *
 *  - Lower bound: a tree of rank 1 has only trivial routing paths, so
 *    FatTree routing is not needed for it.
 *  - Upper bound: every node (switch) gets a unique tuple.  Switches
 *    are stored in two cl_qmaps, one ordered by guid and one ordered
 *    by a key generated from the tuple.  cl_qmap supports only 64-bit
 *    keys, so a tuple is at most 8 bytes long and therefore the
 *    maximal tree rank is 8.
 *
 * The same one-byte-per-level encoding also means that a switch can
 * have at most 255 up/down ports.
 */

#define FAT_TREE_MIN_RANK 2
#define FAT_TREE_MAX_RANK 8
76
/*
 * Direction of a port group as seen from its own node.
 * FTREE_DIRECTION_SAME is the value used for sibling port groups.
 */
typedef enum {
        FTREE_DIRECTION_DOWN = -1,
        FTREE_DIRECTION_SAME = 0,
        FTREE_DIRECTION_UP = 1
} ftree_direction_t;
82
83 /***************************************************
84  **
85  **  Forward references
86  **
87  ***************************************************/
88 struct ftree_sw_t_;
89 struct ftree_hca_t_;
90 struct ftree_port_t_;
91 struct ftree_port_group_t_;
92 struct ftree_fabric_t_;
93
94 /***************************************************
95  **
96  **  ftree_tuple_t definition
97  **
98  ***************************************************/
99
/* Number of bytes in a tuple; equals the width of ftree_tuple_key_t. */
#define FTREE_TUPLE_LEN 8
/* Size of each text buffer used when formatting a tuple as a string. */
#define FTREE_TUPLE_BUFF_LEN 1024

/* 64-bit value that a tuple is copied into/out of byte-for-byte. */
typedef uint64_t ftree_tuple_key_t;
/* A tuple is a fixed-length array of bytes; 0xFF marks an unused slot. */
typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
105
106 /***************************************************
107  **
108  **  ftree_sw_table_element_t definition
109  **
110  ***************************************************/
111
112 typedef struct {
113         cl_map_item_t map_item;
114         struct ftree_sw_t_ *p_sw;
115 } ftree_sw_tbl_element_t;
116
117 /***************************************************
118  **
119  **  ftree_port_t definition
120  **
121  ***************************************************/
122
123 typedef struct ftree_port_t_ {
124         cl_map_item_t map_item;
125         uint8_t port_num;       /* port number on the current node */
126         uint8_t remote_port_num;        /* port number on the remote node */
127         uint32_t counter_up;    /* number of allocated routes upwards */
128         uint32_t counter_down;  /* number of allocated routes downwards */
129 } ftree_port_t;
130
131 /***************************************************
132  **
133  **  ftree_port_group_t definition
134  **
135  ***************************************************/
136
/*
 * Pointer to either an HCA or a switch.  Which member is valid is
 * decided by a node-type field stored next to the union by its user.
 */
union ftree_hca_or_sw_ {
        struct ftree_hca_t_ *p_hca;
        struct ftree_sw_t_ *p_sw;
};

typedef union ftree_hca_or_sw_ ftree_hca_or_sw;
141
142 typedef struct ftree_port_group_t_ {
143         cl_map_item_t map_item;
144         uint16_t lid;   /* lid of the current node */
145         uint16_t remote_lid;    /* lid of the remote node */
146         ib_net64_t port_guid;   /* port guid of this port */
147         ib_net64_t node_guid;   /* this node's guid */
148         uint8_t node_type;      /* this node's type */
149         ib_net64_t remote_port_guid;    /* port guid of the remote port */
150         ib_net64_t remote_node_guid;    /* node guid of the remote node */
151         uint8_t remote_node_type;       /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
152         ftree_hca_or_sw hca_or_sw;      /* pointer to this hca/switch */
153         ftree_hca_or_sw remote_hca_or_sw;       /* pointer to remote hca/switch */
154         cl_ptr_vector_t ports;  /* vector of ports to the same lid */
155         boolean_t is_cn;        /* whether this port is a compute node */
156         boolean_t is_io;        /* whether this port is an I/O node */
157         uint32_t counter_down;  /* number of allocated routes downwards */
158         uint32_t counter_up;    /* number of allocated routes upwards */
159 } ftree_port_group_t;
160
161 /***************************************************
162  **
163  **  ftree_sw_t definition
164  **
165  ***************************************************/
166
167 typedef struct ftree_sw_t_ {
168         cl_map_item_t map_item;
169         osm_switch_t *p_osm_sw;
170         uint32_t rank;
171         ftree_tuple_t tuple;
172         uint16_t lid;
173         ftree_port_group_t **down_port_groups;
174         uint8_t down_port_groups_num;
175         ftree_port_group_t **sibling_port_groups;
176         uint8_t sibling_port_groups_num;
177         ftree_port_group_t **up_port_groups;
178         uint8_t up_port_groups_num;
179         boolean_t is_leaf;
180         unsigned down_port_groups_idx;
181         uint8_t *hops;
182         uint32_t min_counter_down;
183         boolean_t counter_up_changed;
184 } ftree_sw_t;
185
186 /***************************************************
187  **
188  **  ftree_hca_t definition
189  **
190  ***************************************************/
191
192 typedef struct ftree_hca_t_ {
193         cl_map_item_t map_item;
194         osm_node_t *p_osm_node;
195         ftree_port_group_t **up_port_groups;
196         uint8_t *disconnected_ports;
197         uint16_t up_port_groups_num;
198         unsigned cn_num;
199 } ftree_hca_t;
200
201 /***************************************************
202  **
203  **  ftree_fabric_t definition
204  **
205  ***************************************************/
206
207 typedef struct ftree_fabric_t_ {
208         osm_opensm_t *p_osm;
209         osm_subn_t *p_subn;
210         cl_qmap_t hca_tbl;
211         cl_qmap_t sw_tbl;
212         cl_qmap_t sw_by_tuple_tbl;
213         cl_qmap_t cn_guid_tbl;
214         cl_qmap_t io_guid_tbl;
215         unsigned cn_num;
216         unsigned ca_ports;
217         uint8_t leaf_switch_rank;
218         uint8_t max_switch_rank;
219         ftree_sw_t **leaf_switches;
220         uint32_t leaf_switches_num;
221         uint16_t max_cn_per_leaf;
222         uint16_t lft_max_lid;
223         boolean_t fabric_built;
224 } ftree_fabric_t;
225
226 static inline osm_subn_t *ftree_get_subnet(IN ftree_fabric_t * p_ftree)
227 {
228         return p_ftree->p_subn;
229 }
230
231 /***************************************************
232  **
233  ** comparators
234  **
235  ***************************************************/
236
237 static int compare_switches_by_index(IN const void *p1, IN const void *p2)
238 {
239         ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1;
240         ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2;
241
242         uint16_t i;
243         for (i = 0; i < FTREE_TUPLE_LEN; i++) {
244                 if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
245                         return 1;
246                 if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
247                         return -1;
248         }
249         return 0;
250 }
251
252 /***************************************************/
253
254 static int
255 compare_port_groups_by_remote_switch_index(IN const void *p1, IN const void *p2)
256 {
257         ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
258         ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
259
260         return
261             compare_switches_by_index(&((*pp_g1)->remote_hca_or_sw.p_sw),
262                                       &((*pp_g2)->remote_hca_or_sw.p_sw));
263 }
264
265 /***************************************************
266  **
267  ** ftree_tuple_t functions
268  **
269  ***************************************************/
270
271 static void tuple_init(IN ftree_tuple_t tuple)
272 {
273         memset(tuple, 0xFF, FTREE_TUPLE_LEN);
274 }
275
276 /***************************************************/
277
278 static inline boolean_t tuple_assigned(IN ftree_tuple_t tuple)
279 {
280         return (tuple[0] != 0xFF);
281 }
282
283 /***************************************************/
284
285 #define FTREE_TUPLE_BUFFERS_NUM 6
286
287 static const char *tuple_to_str(IN ftree_tuple_t tuple)
288 {
289         static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
290         static uint8_t ind = 0;
291         char *ret_buffer;
292         uint32_t i;
293
294         if (!tuple_assigned(tuple))
295                 return "INDEX.NOT.ASSIGNED";
296
297         buffer[ind][0] = '\0';
298
299         for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
300                 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
301                         return "INDEX.TOO.LONG";
302                 if (i != 0)
303                         strcat(buffer[ind], ".");
304                 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
305         }
306
307         ret_buffer = buffer[ind];
308         ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
309         return ret_buffer;
310 }                               /* tuple_to_str() */
311
312 /***************************************************/
313
314 static inline ftree_tuple_key_t tuple_to_key(IN ftree_tuple_t tuple)
315 {
316         ftree_tuple_key_t key;
317         memcpy(&key, tuple, FTREE_TUPLE_LEN);
318         return key;
319 }
320
321 /***************************************************/
322
323 static inline void tuple_from_key(IN ftree_tuple_t tuple,
324                                   IN ftree_tuple_key_t key)
325 {
326         memcpy(tuple, &key, FTREE_TUPLE_LEN);
327 }
328
329 /***************************************************
330  **
331  ** ftree_sw_tbl_element_t functions
332  **
333  ***************************************************/
334
335 static ftree_sw_tbl_element_t *sw_tbl_element_create(IN ftree_sw_t * p_sw)
336 {
337         ftree_sw_tbl_element_t *p_element =
338             (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
339         if (!p_element)
340                 return NULL;
341         memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
342
343         p_element->p_sw = p_sw;
344         return p_element;
345 }
346
347 /***************************************************/
348
349 static void sw_tbl_element_destroy(IN ftree_sw_tbl_element_t * p_element)
350 {
351         free(p_element);
352 }
353
354 /***************************************************
355  **
356  ** ftree_port_t functions
357  **
358  ***************************************************/
359
360 static ftree_port_t *port_create(IN uint8_t port_num,
361                                  IN uint8_t remote_port_num)
362 {
363         ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
364         if (!p_port)
365                 return NULL;
366         memset(p_port, 0, sizeof(ftree_port_t));
367
368         p_port->port_num = port_num;
369         p_port->remote_port_num = remote_port_num;
370
371         return p_port;
372 }
373
374 /***************************************************/
375
376 static void port_destroy(IN ftree_port_t * p_port)
377 {
378         free(p_port);
379 }
380
381 /***************************************************
382  **
383  ** ftree_port_group_t functions
384  **
385  ***************************************************/
386
387 static ftree_port_group_t *port_group_create(IN uint16_t lid,
388                                              IN uint16_t remote_lid,
389                                              IN ib_net64_t port_guid,
390                                              IN ib_net64_t node_guid,
391                                              IN uint8_t node_type,
392                                              IN void *p_hca_or_sw,
393                                              IN ib_net64_t remote_port_guid,
394                                              IN ib_net64_t remote_node_guid,
395                                              IN uint8_t remote_node_type,
396                                              IN void *p_remote_hca_or_sw,
397                                              IN boolean_t is_cn,
398                                              IN boolean_t is_io)
399 {
400         ftree_port_group_t *p_group =
401             (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
402         if (p_group == NULL)
403                 return NULL;
404         memset(p_group, 0, sizeof(ftree_port_group_t));
405
406         p_group->lid = lid;
407         p_group->remote_lid = remote_lid;
408         memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
409         memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
410         memcpy(&p_group->remote_port_guid, &remote_port_guid,
411                sizeof(ib_net64_t));
412         memcpy(&p_group->remote_node_guid, &remote_node_guid,
413                sizeof(ib_net64_t));
414
415         p_group->node_type = node_type;
416         switch (node_type) {
417         case IB_NODE_TYPE_CA:
418                 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
419                 break;
420         case IB_NODE_TYPE_SWITCH:
421                 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
422                 break;
423         default:
424                 /* we shouldn't get here - port is created only in hca or switch */
425                 CL_ASSERT(0);
426         }
427
428         p_group->remote_node_type = remote_node_type;
429         switch (remote_node_type) {
430         case IB_NODE_TYPE_CA:
431                 p_group->remote_hca_or_sw.p_hca =
432                     (ftree_hca_t *) p_remote_hca_or_sw;
433                 break;
434         case IB_NODE_TYPE_SWITCH:
435                 p_group->remote_hca_or_sw.p_sw =
436                     (ftree_sw_t *) p_remote_hca_or_sw;
437                 break;
438         default:
439                 /* we shouldn't get here - port is created only in hca or switch */
440                 CL_ASSERT(0);
441         }
442
443         cl_ptr_vector_init(&p_group->ports, 0,  /* min size */
444                            8);  /* grow size */
445         p_group->is_cn = is_cn;
446         p_group->is_io = is_io;
447         return p_group;
448 }                               /* port_group_create() */
449
450 /***************************************************/
451
452 static void port_group_destroy(IN ftree_port_group_t * p_group)
453 {
454         uint32_t i;
455         uint32_t size;
456         ftree_port_t *p_port;
457
458         if (!p_group)
459                 return;
460
461         /* remove all the elements of p_group->ports vector */
462         size = cl_ptr_vector_get_size(&p_group->ports);
463         for (i = 0; i < size; i++)
464                 if (cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port) == CL_SUCCESS)
465                         port_destroy(p_port);
466
467         cl_ptr_vector_destroy(&p_group->ports);
468         free(p_group);
469 }                               /* port_group_destroy() */
470
471 /***************************************************/
472
473 static void port_group_dump(IN ftree_fabric_t * p_ftree,
474                             IN ftree_port_group_t * p_group,
475                             IN ftree_direction_t direction)
476 {
477         ftree_port_t *p_port;
478         uint32_t size;
479         uint32_t i;
480         char *buff;
481
482         if (!p_group)
483                 return;
484
485         if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
486                 return;
487
488         size = cl_ptr_vector_get_size(&p_group->ports);
489
490         buff = calloc(10, 1024);
491         if (!buff) {
492                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB33: "
493                         "Failed to allocate buffer\n");
494                 return;
495         }
496
497         for (i = 0; i < size; i++) {
498                 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
499                 CL_ASSERT(p_port);
500
501                 if (i != 0)
502                         strcat(buff, ", ");
503                 sprintf(buff + strlen(buff), "%u", p_port->port_num);
504         }
505
506         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
507                 "    Port Group of size %u, port(s): %s, direction: %s\n"
508                 "                  Local <--> Remote GUID (LID):"
509                 "0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
510                 size, buff,
511                 (direction == FTREE_DIRECTION_DOWN) ? "DOWN" : (direction ==
512                                                                 FTREE_DIRECTION_SAME)
513                 ? "SIBLING" : "UP", cl_ntoh64(p_group->port_guid),
514                 p_group->lid, cl_ntoh64(p_group->remote_port_guid),
515                 p_group->remote_lid);
516
517         free(buff);
518
519 }                               /* port_group_dump() */
520
521 /***************************************************/
522
523 static void port_group_add_port(IN ftree_port_group_t * p_group,
524                                 IN uint8_t port_num, IN uint8_t remote_port_num)
525 {
526         uint16_t i;
527         ftree_port_t *p_port;
528
529         for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) {
530                 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
531                 if (p_port->port_num == port_num)
532                         return;
533         }
534
535         p_port = port_create(port_num, remote_port_num);
536         CL_ASSERT(p_port);
537         cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
538 }
539
540 /***************************************************
541  **
542  ** ftree_sw_t functions
543  **
544  ***************************************************/
545
546 static ftree_sw_t *sw_create(IN osm_switch_t * p_osm_sw)
547 {
548         ftree_sw_t *p_sw;
549         uint8_t ports_num;
550
551         /* make sure that the switch has ports */
552         if (p_osm_sw->num_ports == 1)
553                 return NULL;
554
555         p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
556         if (p_sw == NULL)
557                 return NULL;
558         memset(p_sw, 0, sizeof(ftree_sw_t));
559
560         p_sw->p_osm_sw = p_osm_sw;
561         p_sw->rank = 0xFFFFFFFF;
562         tuple_init(p_sw->tuple);
563
564         p_sw->lid =
565             cl_ntoh16(osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0));
566
567         ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
568         p_sw->down_port_groups =
569             (ftree_port_group_t **) malloc(ports_num *
570                                            sizeof(ftree_port_group_t *));
571         if (p_sw->down_port_groups == NULL)
572                 goto FREE_P_SW;
573         memset(p_sw->down_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
574
575         p_sw->up_port_groups =
576             (ftree_port_group_t **) malloc(ports_num *
577                                            sizeof(ftree_port_group_t *));
578         if (p_sw->up_port_groups == NULL)
579                 goto FREE_DOWN;
580         memset(p_sw->up_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
581
582         p_sw->sibling_port_groups =
583             (ftree_port_group_t **) malloc(ports_num *
584                                            sizeof(ftree_port_group_t *));
585         if (p_sw->sibling_port_groups == NULL)
586                 goto FREE_UP;
587         memset(p_sw->sibling_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
588
589         /* initialize lft buffer */
590         memset(p_osm_sw->new_lft, OSM_NO_PATH, p_osm_sw->lft_size);
591         p_sw->hops = malloc((p_osm_sw->max_lid_ho + 1) * sizeof(*(p_sw->hops)));
592         if (p_sw->hops == NULL)
593                 goto FREE_SIBLING;
594
595         memset(p_sw->hops, OSM_NO_PATH, p_osm_sw->max_lid_ho + 1);
596
597         return p_sw;
598
599 FREE_SIBLING:
600         free(p_sw->sibling_port_groups);
601 FREE_UP:
602         free(p_sw->up_port_groups);
603 FREE_DOWN:
604         free(p_sw->down_port_groups);
605 FREE_P_SW:
606         free(p_sw);
607         return NULL;
608 }                               /* sw_create() */
609
610 /***************************************************/
611
612 static void sw_destroy(IN ftree_sw_t * p_sw)
613 {
614         uint8_t i;
615
616         if (!p_sw)
617                 return;
618         free(p_sw->hops);
619
620         for (i = 0; i < p_sw->down_port_groups_num; i++)
621                 port_group_destroy(p_sw->down_port_groups[i]);
622         for (i = 0; i < p_sw->sibling_port_groups_num; i++)
623                 port_group_destroy(p_sw->sibling_port_groups[i]);
624         for (i = 0; i < p_sw->up_port_groups_num; i++)
625                 port_group_destroy(p_sw->up_port_groups[i]);
626         free(p_sw->down_port_groups);
627         free(p_sw->sibling_port_groups);
628         free(p_sw->up_port_groups);
629
630         free(p_sw);
631 }                               /* sw_destroy() */
632
633 /***************************************************/
634
635 static uint64_t sw_get_guid_no(IN ftree_sw_t * p_sw)
636 {
637         if (!p_sw)
638                 return 0;
639         return osm_node_get_node_guid(p_sw->p_osm_sw->p_node);
640 }
641
642 /***************************************************/
643
644 static uint64_t sw_get_guid_ho(IN ftree_sw_t * p_sw)
645 {
646         return cl_ntoh64(sw_get_guid_no(p_sw));
647 }
648
649 /***************************************************/
650
651 static void sw_dump(IN ftree_fabric_t * p_ftree, IN ftree_sw_t * p_sw)
652 {
653         uint32_t i;
654
655         if (!p_sw)
656                 return;
657
658         if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
659                 return;
660
661         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
662                 "Switch index: %s, GUID: 0x%016" PRIx64
663                 ", Ports: %u DOWN, %u SIBLINGS, %u UP\n",
664                 tuple_to_str(p_sw->tuple), sw_get_guid_ho(p_sw),
665                 p_sw->down_port_groups_num, p_sw->sibling_port_groups_num,
666                 p_sw->up_port_groups_num);
667
668         for (i = 0; i < p_sw->down_port_groups_num; i++)
669                 port_group_dump(p_ftree, p_sw->down_port_groups[i],
670                                 FTREE_DIRECTION_DOWN);
671         for (i = 0; i < p_sw->sibling_port_groups_num; i++)
672                 port_group_dump(p_ftree, p_sw->sibling_port_groups[i],
673                                 FTREE_DIRECTION_SAME);
674         for (i = 0; i < p_sw->up_port_groups_num; i++)
675                 port_group_dump(p_ftree, p_sw->up_port_groups[i],
676                                 FTREE_DIRECTION_UP);
677
678 }                               /* sw_dump() */
679
680 /***************************************************/
681
682 static boolean_t sw_ranked(IN ftree_sw_t * p_sw)
683 {
684         return (p_sw->rank != 0xFFFFFFFF);
685 }
686
687 /***************************************************/
688
689 static ftree_port_group_t *sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
690                                                            IN uint16_t
691                                                            remote_lid,
692                                                            IN ftree_direction_t
693                                                            direction)
694 {
695         uint32_t i;
696         uint32_t size;
697         ftree_port_group_t **port_groups;
698
699         if (direction == FTREE_DIRECTION_UP) {
700                 port_groups = p_sw->up_port_groups;
701                 size = p_sw->up_port_groups_num;
702         } else if (direction == FTREE_DIRECTION_SAME) {
703                 port_groups = p_sw->sibling_port_groups;
704                 size = p_sw->sibling_port_groups_num;
705         } else {
706                 port_groups = p_sw->down_port_groups;
707                 size = p_sw->down_port_groups_num;
708         }
709
710         for (i = 0; i < size; i++)
711                 if (remote_lid == port_groups[i]->remote_lid)
712                         return port_groups[i];
713
714         return NULL;
715 }                               /* sw_get_port_group_by_remote_lid() */
716
717 /***************************************************/
718
719 static void sw_add_port(IN ftree_sw_t * p_sw, IN uint8_t port_num,
720                         IN uint8_t remote_port_num, IN uint16_t lid,
721                         IN uint16_t remote_lid, IN ib_net64_t port_guid,
722                         IN ib_net64_t remote_port_guid,
723                         IN ib_net64_t remote_node_guid,
724                         IN uint8_t remote_node_type,
725                         IN void *p_remote_hca_or_sw,
726                         IN ftree_direction_t direction)
727 {
728         ftree_port_group_t *p_group =
729             sw_get_port_group_by_remote_lid(p_sw, remote_lid, direction);
730
731         if (!p_group) {
732                 p_group = port_group_create(lid, remote_lid,
733                                             port_guid, sw_get_guid_no(p_sw),
734                                             IB_NODE_TYPE_SWITCH, p_sw,
735                                             remote_port_guid, remote_node_guid,
736                                             remote_node_type,
737                                             p_remote_hca_or_sw, FALSE, FALSE);
738                 CL_ASSERT(p_group);
739
740                 if (direction == FTREE_DIRECTION_UP) {
741                         p_sw->up_port_groups[p_sw->up_port_groups_num++] =
742                             p_group;
743                 } else if (direction == FTREE_DIRECTION_SAME) {
744                         p_sw->
745                             sibling_port_groups[p_sw->sibling_port_groups_num++]
746                             = p_group;
747                 } else
748                         p_sw->down_port_groups[p_sw->down_port_groups_num++] =
749                             p_group;
750         }
751         port_group_add_port(p_group, port_num, remote_port_num);
752
753 }                               /* sw_add_port() */
754
755 /***************************************************/
756
757 static inline cl_status_t sw_set_hops(IN ftree_sw_t * p_sw, IN uint16_t lid,
758                                       IN uint8_t port_num, IN uint8_t hops,
759                                       IN boolean_t is_target_sw)
760 {
761         /* set local min hop table(LID) */
762         p_sw->hops[lid] = hops;
763         if (is_target_sw)
764                 return osm_switch_set_hops(p_sw->p_osm_sw, lid, port_num, hops);
765         return 0;
766 }
767
768 /***************************************************/
769
770 static int set_hops_on_remote_sw(IN ftree_port_group_t * p_group,
771                                  IN uint16_t target_lid, IN uint8_t hops,
772                                  IN boolean_t is_target_sw)
773 {
774         ftree_port_t *p_port;
775         uint8_t i, ports_num;
776         ftree_sw_t *p_remote_sw = p_group->remote_hca_or_sw.p_sw;
777
778         /* if lid is a switch, we set the min hop table in the osm_switch struct */
779         CL_ASSERT(p_group->remote_node_type == IB_NODE_TYPE_SWITCH);
780         p_remote_sw->hops[target_lid] = hops;
781
782         /* If target lid is a switch we set the min hop table values
783          * for each port on the associated osm_sw struct */
784         if (!is_target_sw)
785                 return 0;
786
787         ports_num = (uint8_t) cl_ptr_vector_get_size(&p_group->ports);
788         for (i = 0; i < ports_num; i++) {
789                 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
790                 if (sw_set_hops(p_remote_sw, target_lid,
791                                 p_port->remote_port_num, hops, is_target_sw))
792                         return -1;
793         }
794         return 0;
795 }
796
797 /***************************************************/
798
799 static inline uint8_t
800 sw_get_least_hops(IN ftree_sw_t * p_sw, IN uint16_t target_lid)
801 {
802         CL_ASSERT(p_sw->hops != NULL);
803         return p_sw->hops[target_lid];
804 }
805
806 /***************************************************
807  **
808  ** ftree_hca_t functions
809  **
810  ***************************************************/
811
812 static ftree_hca_t *hca_create(IN osm_node_t * p_osm_node)
813 {
814         ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
815         if (p_hca == NULL)
816                 return NULL;
817         memset(p_hca, 0, sizeof(ftree_hca_t));
818
819         p_hca->p_osm_node = p_osm_node;
820         p_hca->up_port_groups = (ftree_port_group_t **)
821             malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
822                    sizeof(ftree_port_group_t *));
823         if (!p_hca->up_port_groups) {
824                 free(p_hca);
825                 return NULL;
826         }
827         memset(p_hca->up_port_groups, 0, osm_node_get_num_physp(p_hca->p_osm_node) *
828                sizeof(ftree_port_group_t *));
829
830         p_hca->disconnected_ports = (uint8_t *)
831             calloc(osm_node_get_num_physp(p_hca->p_osm_node) + 1, sizeof(uint8_t));
832         if (!p_hca->disconnected_ports) {
833                 free(p_hca->up_port_groups);
834                 free(p_hca);
835                 return NULL;
836         }
837         p_hca->up_port_groups_num = 0;
838         return p_hca;
839 }
840
841 /***************************************************/
842
843 static void hca_destroy(IN ftree_hca_t * p_hca)
844 {
845         uint32_t i;
846
847         if (!p_hca)
848                 return;
849
850         for (i = 0; i < p_hca->up_port_groups_num; i++)
851                 port_group_destroy(p_hca->up_port_groups[i]);
852
853         free(p_hca->up_port_groups);
854         free(p_hca->disconnected_ports);
855
856         free(p_hca);
857 }
858
859 /***************************************************/
860
861 static uint64_t hca_get_guid_no(IN ftree_hca_t * p_hca)
862 {
863         if (!p_hca)
864                 return 0;
865         return osm_node_get_node_guid(p_hca->p_osm_node);
866 }
867
868 /***************************************************/
869
870 static uint64_t hca_get_guid_ho(IN ftree_hca_t * p_hca)
871 {
872         return cl_ntoh64(hca_get_guid_no(p_hca));
873 }
874
875 /***************************************************/
876
877 static void hca_dump(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
878 {
879         uint32_t i;
880
881         if (!p_hca)
882                 return;
883
884         if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
885                 return;
886
887         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
888                 "CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
889                 hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);
890
891         for (i = 0; i < p_hca->up_port_groups_num; i++)
892                 port_group_dump(p_ftree, p_hca->up_port_groups[i],
893                                 FTREE_DIRECTION_UP);
894 }
895
896 static ftree_port_group_t *hca_get_port_group_by_lid(IN ftree_hca_t *
897                                                      p_hca,
898                                                      IN uint16_t
899                                                      lid)
900 {
901         uint32_t i;
902         for (i = 0; i < p_hca->up_port_groups_num; i++)
903                 if (lid ==
904                     p_hca->up_port_groups[i]->lid)
905                         return p_hca->up_port_groups[i];
906
907         return NULL;
908 }
909 /***************************************************/
910
911 static void hca_add_port(IN ftree_fabric_t * p_ftree,
912                          IN ftree_hca_t * p_hca, IN uint8_t port_num,
913                          IN uint8_t remote_port_num, IN uint16_t lid,
914                          IN uint16_t remote_lid, IN ib_net64_t port_guid,
915                          IN ib_net64_t remote_port_guid,
916                          IN ib_net64_t remote_node_guid,
917                          IN uint8_t remote_node_type,
918                          IN void *p_remote_hca_or_sw, IN boolean_t is_cn,
919                          IN boolean_t is_io)
920 {
921         ftree_port_group_t *p_group;
922
923         /* this function is supposed to be called only for adding ports
924            in hca's that lead to switches */
925         CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
926
927         p_group = hca_get_port_group_by_lid(p_hca, lid);
928
929         if (!p_group) {
930                 p_group = port_group_create(lid, remote_lid,
931                                             port_guid, hca_get_guid_no(p_hca),
932                                             IB_NODE_TYPE_CA, p_hca,
933                                             remote_port_guid, remote_node_guid,
934                                             remote_node_type,
935                                             p_remote_hca_or_sw, is_cn, is_io);
936                 CL_ASSERT(p_group);
937                 p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
938                 port_group_add_port(p_group, port_num, remote_port_num);
939         } else
940                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
941                         "ERR AB32: Duplicated LID for CA GUID: 0x%016" PRIx64 "\n",
942                         cl_ntoh64(port_guid));
943 }                               /* hca_add_port() */
944
945 /***************************************************
946  **
947  ** ftree_fabric_t functions
948  **
949  ***************************************************/
950
951 static ftree_fabric_t *fabric_create()
952 {
953         ftree_fabric_t *p_ftree =
954             (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
955         if (p_ftree == NULL)
956                 return NULL;
957
958         memset(p_ftree, 0, sizeof(ftree_fabric_t));
959
960         cl_qmap_init(&p_ftree->hca_tbl);
961         cl_qmap_init(&p_ftree->sw_tbl);
962         cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
963         cl_qmap_init(&p_ftree->cn_guid_tbl);
964         cl_qmap_init(&p_ftree->io_guid_tbl);
965
966         return p_ftree;
967 }
968
969 /***************************************************/
970
971 static void fabric_clear(ftree_fabric_t * p_ftree)
972 {
973         ftree_hca_t *p_hca;
974         ftree_hca_t *p_next_hca;
975         ftree_sw_t *p_sw;
976         ftree_sw_t *p_next_sw;
977         ftree_sw_tbl_element_t *p_element;
978         ftree_sw_tbl_element_t *p_next_element;
979         name_map_item_t *p_guid_element, *p_next_guid_element;
980
981         if (!p_ftree)
982                 return;
983
984         /* remove all the elements of hca_tbl */
985
986         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
987         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
988                 p_hca = p_next_hca;
989                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
990                 hca_destroy(p_hca);
991         }
992         cl_qmap_remove_all(&p_ftree->hca_tbl);
993
994         /* remove all the elements of sw_tbl */
995
996         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
997         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
998                 p_sw = p_next_sw;
999                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1000                 sw_destroy(p_sw);
1001         }
1002         cl_qmap_remove_all(&p_ftree->sw_tbl);
1003
1004         /* remove all the elements of sw_by_tuple_tbl */
1005
1006         p_next_element =
1007             (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
1008         while (p_next_element != (ftree_sw_tbl_element_t *)
1009                cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) {
1010                 p_element = p_next_element;
1011                 p_next_element = (ftree_sw_tbl_element_t *)
1012                     cl_qmap_next(&p_element->map_item);
1013                 sw_tbl_element_destroy(p_element);
1014         }
1015         cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
1016
1017         /* remove all the elements of cn_guid_tbl */
1018         p_next_guid_element =
1019             (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
1020         while (p_next_guid_element !=
1021                (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
1022                 p_guid_element = p_next_guid_element;
1023                 p_next_guid_element =
1024                     (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1025                 free(p_guid_element);
1026         }
1027         cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
1028
1029         /* remove all the elements of io_guid_tbl */
1030         p_next_guid_element =
1031             (name_map_item_t *) cl_qmap_head(&p_ftree->io_guid_tbl);
1032         while (p_next_guid_element !=
1033                (name_map_item_t *) cl_qmap_end(&p_ftree->io_guid_tbl)) {
1034                 p_guid_element = p_next_guid_element;
1035                 p_next_guid_element =
1036                     (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1037                 free(p_guid_element);
1038         }
1039         cl_qmap_remove_all(&p_ftree->io_guid_tbl);
1040
1041         /* free the leaf switches array */
1042         if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
1043                 free(p_ftree->leaf_switches);
1044
1045         p_ftree->leaf_switches_num = 0;
1046         p_ftree->cn_num = 0;
1047         p_ftree->ca_ports = 0;
1048         p_ftree->leaf_switch_rank = 0;
1049         p_ftree->max_switch_rank = 0;
1050         p_ftree->max_cn_per_leaf = 0;
1051         p_ftree->lft_max_lid = 0;
1052         p_ftree->leaf_switches = NULL;
1053         p_ftree->fabric_built = FALSE;
1054
1055 }                               /* fabric_destroy() */
1056
1057 /***************************************************/
1058
1059 static void fabric_destroy(ftree_fabric_t * p_ftree)
1060 {
1061         if (!p_ftree)
1062                 return;
1063         fabric_clear(p_ftree);
1064         free(p_ftree);
1065 }
1066
1067 /***************************************************/
1068
1069 static uint8_t fabric_get_rank(ftree_fabric_t * p_ftree)
1070 {
1071         return p_ftree->leaf_switch_rank + 1;
1072 }
1073
1074 /***************************************************/
1075
1076 static void fabric_add_hca(ftree_fabric_t * p_ftree, osm_node_t * p_osm_node)
1077 {
1078         ftree_hca_t *p_hca;
1079
1080         CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
1081
1082         p_hca = hca_create(p_osm_node);
1083         if (!p_hca)
1084                 return;
1085
1086         cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid,
1087                        &p_hca->map_item);
1088 }
1089
1090 /***************************************************/
1091
1092 static void fabric_add_sw(ftree_fabric_t * p_ftree, osm_switch_t * p_osm_sw)
1093 {
1094         ftree_sw_t *p_sw;
1095
1096         CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
1097
1098         p_sw = sw_create(p_osm_sw);
1099         if (!p_sw)
1100                 return;
1101
1102         cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid,
1103                        &p_sw->map_item);
1104
1105         /* track the max lid (in host order) that exists in the fabric */
1106         if (p_sw->lid > p_ftree->lft_max_lid)
1107                 p_ftree->lft_max_lid = p_sw->lid;
1108 }
1109
1110 /***************************************************/
1111
1112 static void fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1113                                    IN ftree_sw_t * p_sw)
1114 {
1115         CL_ASSERT(tuple_assigned(p_sw->tuple));
1116
1117         cl_qmap_insert(&p_ftree->sw_by_tuple_tbl, tuple_to_key(p_sw->tuple),
1118                        &sw_tbl_element_create(p_sw)->map_item);
1119 }
1120
1121 /***************************************************/
1122
1123 static ftree_sw_t *fabric_get_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1124                                           IN ftree_tuple_t tuple)
1125 {
1126         ftree_sw_tbl_element_t *p_element;
1127
1128         CL_ASSERT(tuple_assigned(tuple));
1129
1130         tuple_to_key(tuple);
1131
1132         p_element =
1133             (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1134                                                    tuple_to_key(tuple));
1135         if (p_element ==
1136             (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1137                 return NULL;
1138
1139         return p_element->p_sw;
1140 }
1141
1142 /***************************************************/
1143
1144 static ftree_sw_t *fabric_get_sw_by_guid(IN ftree_fabric_t * p_ftree,
1145                                          IN uint64_t guid)
1146 {
1147         ftree_sw_t *p_sw;
1148         p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1149         if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1150                 return NULL;
1151         return p_sw;
1152 }
1153
1154 /***************************************************/
1155
1156 static ftree_hca_t *fabric_get_hca_by_guid(IN ftree_fabric_t * p_ftree,
1157                                            IN uint64_t guid)
1158 {
1159         ftree_hca_t *p_hca;
1160         p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1161         if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1162                 return NULL;
1163         return p_hca;
1164 }
1165
1166 /***************************************************/
1167
1168 static void fabric_dump(ftree_fabric_t * p_ftree)
1169 {
1170         uint32_t i;
1171         ftree_hca_t *p_hca;
1172         ftree_sw_t *p_sw;
1173
1174         if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
1175                 return;
1176
1177         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1178                 "                       |-------------------------------|\n"
1179                 "                       |-  Full fabric topology dump  -|\n"
1180                 "                       |-------------------------------|\n\n");
1181
1182         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");
1183
1184         for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1185              p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
1186              p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
1187                 hca_dump(p_ftree, p_hca);
1188         }
1189
1190         for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1191                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1192                         "-- Rank %u switches\n", i);
1193                 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1194                      p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1195                      p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1196                         if (p_sw->rank == i)
1197                                 sw_dump(p_ftree, p_sw);
1198                 }
1199         }
1200
1201         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1202                 "                       |---------------------------------------|\n"
1203                 "                       |- Full fabric topology dump completed -|\n"
1204                 "                       |---------------------------------------|\n\n");
1205 }                               /* fabric_dump() */
1206
1207 /***************************************************/
1208
1209 static void fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
1210 {
1211         uint32_t i, j;
1212         ftree_sw_t *p_sw;
1213
1214         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1215                 "General fabric topology info\n");
1216         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1217                 "============================\n");
1218
1219         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1220                 "  - FatTree rank (roots to leaf switches): %u\n",
1221                 p_ftree->leaf_switch_rank + 1);
1222         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1223                 "  - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
1224         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1225                 "  - Fabric has %u CAs, %u CA ports (%u of them CNs), %u switches\n",
1226                 cl_qmap_count(&p_ftree->hca_tbl), p_ftree->ca_ports,
1227                 p_ftree->cn_num, cl_qmap_count(&p_ftree->sw_tbl));
1228
1229         CL_ASSERT(p_ftree->ca_ports >= p_ftree->cn_num);
1230
1231         for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1232                 j = 0;
1233                 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1234                      p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1235                      p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1236                         if (p_sw->rank == i)
1237                                 j++;
1238                 }
1239                 if (i == 0)
1240                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1241                                 "  - Fabric has %u switches at rank %u (roots)\n",
1242                                 j, i);
1243                 else if (i == p_ftree->leaf_switch_rank)
1244                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1245                                 "  - Fabric has %u switches at rank %u (%u of them leafs)\n",
1246                                 j, i, p_ftree->leaf_switches_num);
1247                 else
1248                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1249                                 "  - Fabric has %u switches at rank %u\n", j,
1250                                 i);
1251         }
1252
1253         if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) {
1254                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1255                         "  - Root switches:\n");
1256                 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1257                      p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1258                      p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1259                         if (p_sw->rank == 0)
1260                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1261                                         "      GUID: 0x%016" PRIx64
1262                                         ", LID: %u, Index %s\n",
1263                                         sw_get_guid_ho(p_sw),
1264                                         p_sw->lid,
1265                                         tuple_to_str(p_sw->tuple));
1266                 }
1267
1268                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1269                         "  - Leaf switches (sorted by index):\n");
1270                 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1271                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1272                                 "      GUID: 0x%016" PRIx64
1273                                 ", LID: %u, Index %s\n",
1274                                 sw_get_guid_ho(p_ftree->leaf_switches[i]),
1275                                 p_ftree->leaf_switches[i]->lid,
1276                                 tuple_to_str(p_ftree->leaf_switches[i]->tuple));
1277                 }
1278         }
1279 }                               /* fabric_dump_general_info() */
1280
1281 /***************************************************/
1282
1283 static void fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
1284 {
1285         ftree_hca_t *p_hca;
1286         ftree_sw_t *p_sw;
1287         ftree_port_group_t *p_group_on_sw;
1288         ftree_port_group_t *p_group_on_hca;
1289         int rename_status = 0;
1290         uint32_t i;
1291         uint32_t j;
1292         unsigned printed_hcas_on_leaf;
1293
1294         char path[1024], path_tmp[1032];
1295         FILE *p_hca_ordering_file;
1296         const char *filename = "opensm-ftree-ca-order.dump";
1297
1298         snprintf(path, sizeof(path), "%s/%s",
1299                  p_ftree->p_osm->subn.opt.dump_files_dir, filename);
1300
1301         snprintf(path_tmp, sizeof(path_tmp), "%s.tmp", path);
1302
1303         p_hca_ordering_file = fopen(path_tmp, "w");
1304         if (!p_hca_ordering_file) {
1305                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
1306                         "cannot open file \'%s\': %s\n", path_tmp,
1307                         strerror(errno));
1308                 return;
1309         }
1310
1311         /* for each leaf switch (in indexing order) */
1312         for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1313                 p_sw = p_ftree->leaf_switches[i];
1314                 printed_hcas_on_leaf = 0;
1315
1316                 /* for each real CA (CNs and not) connected to this switch */
1317                 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1318                         p_group_on_sw = p_sw->down_port_groups[j];
1319
1320                         if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
1321                                 continue;
1322
1323                         p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
1324                         p_group_on_hca =
1325                             hca_get_port_group_by_lid(p_hca,
1326                                                       p_group_on_sw->
1327                                                       remote_lid);
1328
1329                         /* treat non-compute nodes as dummies */
1330                         if (!p_group_on_hca->is_cn)
1331                                 continue;
1332
1333                         fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
1334                                 p_group_on_hca->lid,
1335                                 p_hca->p_osm_node->print_desc);
1336
1337                         printed_hcas_on_leaf++;
1338                 }
1339
1340                 /* now print missing HCAs */
1341                 for (j = 0;
1342                      j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++)
1343                         fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");
1344
1345         }
1346         /* done going through all the leaf switches */
1347
1348         fclose(p_hca_ordering_file);
1349
1350         rename_status = rename(path_tmp, path);
1351         if (rename_status) {
1352                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB03: "
1353                         "cannot rename file \'%s\': %s\n", path_tmp,
1354                         strerror(errno));
1355         }
1356 }                               /* fabric_dump_hca_ordering() */
1357
1358 /***************************************************/
1359
1360 static void fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
1361                                 IN ftree_sw_t * p_sw,
1362                                 IN ftree_tuple_t new_tuple)
1363 {
1364         memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
1365         fabric_add_sw_by_tuple(p_ftree, p_sw);
1366 }
1367
1368 /***************************************************/
1369
1370 static void fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
1371                                       IN ftree_sw_t * p_sw,
1372                                       IN unsigned int subtree)
1373 {
1374         uint8_t i;
1375         ftree_tuple_t new_tuple;
1376
1377         if (p_ftree->leaf_switch_rank >= FTREE_TUPLE_LEN)
1378                 return;
1379
1380         tuple_init(new_tuple);
1381         new_tuple[0] = (uint8_t) p_sw->rank;
1382
1383         for (i = 1; i <= p_ftree->leaf_switch_rank; i++)
1384                 new_tuple[i] = 0;
1385
1386         if (p_sw->rank == 0) {
1387                 if (p_ftree->leaf_switch_rank > 1)
1388                         new_tuple[p_ftree->leaf_switch_rank] = subtree;
1389
1390                 for (i = 0; i < 0xFF; i++) {
1391                         new_tuple[1] = i;
1392                         if (fabric_get_sw_by_tuple(p_ftree, new_tuple) == NULL)
1393                                 break;
1394                 }
1395                 if (i == 0xFF) {
1396                         /* new tuple not found - there are more than 255 ports in one direction */
1397                         return;
1398                 }
1399         }
1400         fabric_assign_tuple(p_ftree, p_sw, new_tuple);
1401 }
1402
1403 /***************************************************/
1404
1405 static void fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
1406                                  OUT ftree_tuple_t new_tuple,
1407                                  IN ftree_tuple_t from_tuple,
1408                                  IN ftree_direction_t direction)
1409 {
1410         ftree_sw_t *p_sw;
1411         ftree_tuple_t temp_tuple;
1412         uint8_t var_index;
1413         uint8_t i;
1414
1415         tuple_init(new_tuple);
1416         memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
1417
1418         if (direction == FTREE_DIRECTION_DOWN) {
1419                 temp_tuple[0]++;
1420                 var_index = from_tuple[0] + 1;
1421         } else {
1422                 temp_tuple[0]--;
1423                 var_index = from_tuple[0];
1424         }
1425
1426         for (i = 0; i < 0xFF; i++) {
1427                 temp_tuple[var_index] = i;
1428                 p_sw = fabric_get_sw_by_tuple(p_ftree, temp_tuple);
1429                 if (p_sw == NULL)       /* found free tuple */
1430                         break;
1431         }
1432
1433         if (i == 0xFF) {
1434                 /* new tuple not found - there are more than 255 ports in one direction */
1435                 return;
1436         }
1437         memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
1438
1439 }                               /* fabric_get_new_tuple() */
1440
1441 /***************************************************/
1442
1443 static inline boolean_t fabric_roots_provided(IN ftree_fabric_t * p_ftree)
1444 {
1445         return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1446 }
1447
1448 /***************************************************/
1449
1450 static inline boolean_t fabric_cns_provided(IN ftree_fabric_t * p_ftree)
1451 {
1452         return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1453 }
1454
1455 /***************************************************/
1456
1457 static inline boolean_t fabric_ios_provided(IN ftree_fabric_t * p_ftree)
1458 {
1459         return (p_ftree->p_osm->subn.opt.io_guid_file != NULL);
1460 }
1461
1462 /***************************************************/
1463
1464 static int fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
1465 {
1466         ftree_sw_t *p_sw;
1467         ftree_hca_t *p_hca;
1468         ftree_hca_t *p_next_hca;
1469         unsigned i;
1470         int res = 0;
1471
1472         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1473
1474         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1475                 "Marking leaf switches in fabric\n");
1476
1477         /* Scan all the CAs, if they have CNs - find CN port and mark switch
1478            that is connected to this port as leaf switch.
1479            Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */
1480         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1481         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
1482                 p_hca = p_next_hca;
1483                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
1484                 if (!p_hca->cn_num)
1485                         continue;
1486
1487                 for (i = 0; i < p_hca->up_port_groups_num; i++) {
1488                         if (!p_hca->up_port_groups[i]->is_cn)
1489                                 continue;
1490
1491                         /* In CAs, port group alway has one port, and since this
1492                            port group is CN, we know that this port is compute node */
1493                         CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
1494                                   IB_NODE_TYPE_SWITCH);
1495                         p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
1496
1497                         /* check if this switch was already processed */
1498                         if (p_sw->is_leaf)
1499                                 continue;
1500                         p_sw->is_leaf = TRUE;
1501
1502                         /* ensure that this leaf switch is at the correct tree level */
1503                         if (p_sw->rank != p_ftree->leaf_switch_rank) {
1504                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1505                                         "ERR AB26: CN port 0x%" PRIx64
1506                                         " is connected to switch 0x%" PRIx64
1507                                         " with rank %u, "
1508                                         "while FatTree leaf rank is %u\n",
1509                                         cl_ntoh64(p_hca->
1510                                                   up_port_groups[i]->port_guid),
1511                                         sw_get_guid_ho(p_sw), p_sw->rank,
1512                                         p_ftree->leaf_switch_rank);
1513                                 res = -1;
1514                                 goto Exit;
1515
1516                         }
1517                 }
1518         }
1519
1520 Exit:
1521         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1522         return res;
1523 }                               /* fabric_mark_leaf_switches() */
1524
1525 /***************************************************/
1526 static void bfs_fabric_indexing(IN ftree_fabric_t * p_ftree,
1527                                 IN ftree_sw_t *p_first_sw)
1528 {
1529         ftree_sw_t *p_remote_sw;
1530         ftree_sw_t *p_sw = NULL;
1531         ftree_tuple_t new_tuple;
1532         uint32_t i;
1533         cl_list_t bfs_list;
1534
1535         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1536         cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
1537         /*
1538          * Now run BFS and assign indexes to all switches
1539          * Pseudo code of the algorithm is as follows:
1540          *
1541          *  * Add first switch to BFS queue
1542          *  * While (BFS queue not empty)
1543          *      - Pop the switch from the head of the queue
1544          *      - Scan all the downward and upward ports
1545          *      - For each port
1546          *          + Get the remote switch
1547          *          + Assign index to the remote switch
1548          *          + Add remote switch to the BFS queue
1549          */
1550
1551         cl_list_insert_tail(&bfs_list, p_first_sw);
1552
1553         while (!cl_is_list_empty(&bfs_list)) {
1554                 p_sw = (ftree_sw_t *) cl_list_remove_head(&bfs_list);
1555
1556                 /* Discover all the nodes from ports that are pointing down */
1557
1558                 if (p_sw->rank >= p_ftree->leaf_switch_rank) {
1559                         /* whether downward ports are pointing to CAs or switches,
1560                            we don't assign indexes to switches that are located
1561                            lower than leaf switches */
1562                 } else {
1563                         /* This is not the leaf switch */
1564                         for (i = 0; i < p_sw->down_port_groups_num; i++) {
1565                                 /* Work with port groups that are pointing to switches only.
1566                                    No need to assign indexing to HCAs */
1567                                 if (p_sw->
1568                                     down_port_groups[i]->remote_node_type !=
1569                                     IB_NODE_TYPE_SWITCH)
1570                                         continue;
1571
1572                                 p_remote_sw =
1573                                     p_sw->down_port_groups[i]->
1574                                     remote_hca_or_sw.p_sw;
1575                                 if (tuple_assigned(p_remote_sw->tuple)) {
1576                                         /* this switch has been already indexed */
1577                                         continue;
1578                                 }
1579                                 /* allocate new tuple */
1580                                 fabric_get_new_tuple(p_ftree, new_tuple,
1581                                                      p_sw->tuple,
1582                                                      FTREE_DIRECTION_DOWN);
1583                                 /* Assign the new tuple to the remote switch.
1584                                    This fuction also adds the switch into the switch_by_tuple table. */
1585                                 fabric_assign_tuple(p_ftree, p_remote_sw,
1586                                                     new_tuple);
1587
1588                                 /* add the newly discovered switch to the BFS queue */
1589                                 cl_list_insert_tail(&bfs_list, p_remote_sw);
1590                         }
1591                         /* Done assigning indexes to all the remote switches
1592                            that are pointed by the downgoing ports.
1593                            Now sort port groups according to remote index. */
1594                         qsort(p_sw->down_port_groups,   /* array */
1595                               p_sw->down_port_groups_num,       /* number of elements */
1596                               sizeof(ftree_port_group_t *),     /* size of each element */
1597                               compare_port_groups_by_remote_switch_index);      /* comparator */
1598                 }
1599
1600                 /* Done indexing switches from ports that go down.
1601                    Now do the same with ports that are pointing up.
1602                    if we started from root (rank == 0), the leaf is the BFS termination point */
1603
1604                 if (p_sw->rank != 0 && (p_first_sw->rank != 0 || !p_sw->is_leaf)) {
1605                         /* This is not the root switch, which means that all the ports
1606                            that are pointing up are taking us to another switches. */
1607                         for (i = 0; i < p_sw->up_port_groups_num; i++) {
1608                                 p_remote_sw =
1609                                     p_sw->up_port_groups[i]->
1610                                     remote_hca_or_sw.p_sw;
1611                                 if (tuple_assigned(p_remote_sw->tuple))
1612                                         continue;
1613                                 /* allocate new tuple */
1614                                 fabric_get_new_tuple(p_ftree, new_tuple,
1615                                                      p_sw->tuple,
1616                                                      FTREE_DIRECTION_UP);
1617                                 /* Assign the new tuple to the remote switch.
1618                                    This function also adds the switch to the
1619                                    switch_by_tuple table. */
1620                                 fabric_assign_tuple(p_ftree,
1621                                                     p_remote_sw, new_tuple);
1622                                 /* add the newly discovered switch to the BFS queue */
1623                                 cl_list_insert_tail(&bfs_list, p_remote_sw);
1624                         }
1625                         /* Done assigning indexes to all the remote switches
1626                            that are pointed by the upgoing ports.
1627                            Now sort port groups according to remote index. */
1628                         qsort(p_sw->up_port_groups,     /* array */
1629                               p_sw->up_port_groups_num, /* number of elements */
1630                               sizeof(ftree_port_group_t *),     /* size of each element */
1631                               compare_port_groups_by_remote_switch_index);      /* comparator */
1632                 }
1633                 /* Done assigning indexes to all the switches that are directly connected
1634                    to the current switch - go to the next switch in the BFS queue */
1635         }
1636         cl_list_destroy(&bfs_list);
1637
1638         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1639 }
1640
1641 static void fabric_make_indexing(IN ftree_fabric_t * p_ftree)
1642 {
1643         ftree_sw_t *p_sw = NULL;
1644         unsigned int subtree = 0;
1645         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1646
1647         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1648                 "Starting FatTree indexing\n");
1649
1650         /* using the first switch as a starting point for indexing algorithm. */
1651         for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1652              p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1653              p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1654                 if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing) {
1655                         /* find first root switch */
1656                         if (p_sw->rank != 0)
1657                                 continue;
1658                 } else {
1659                         /* find first leaf switch */
1660                         if (!p_sw->is_leaf)
1661                                 continue;
1662                 }
1663                 /* Assign the first tuple to the switch that is used as BFS starting point
1664                    in the subtree.
1665                    The tuple will be as follows: [rank].0...0.subtree
1666                    This fuction also adds the switch it into the switch_by_tuple table. */
1667                 if (!tuple_assigned(p_sw->tuple)) {
1668                         fabric_assign_first_tuple(p_ftree, p_sw, subtree++);
1669                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1670                         "Indexing starting point:\n"
1671                         "                                            - Switch rank  : %u\n"
1672                         "                                            - Switch index : %s\n"
1673                         "                                            - Node LID     : %u\n"
1674                         "                                            - Node GUID    : 0x%016"
1675                         PRIx64 "\n", p_sw->rank, tuple_to_str(p_sw->tuple),
1676                         p_sw->lid, sw_get_guid_ho(p_sw));
1677                 }
1678
1679                 bfs_fabric_indexing(p_ftree, p_sw);
1680
1681                 if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing == FALSE)
1682                         goto Exit;
1683         }
1684         p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1685         while (p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1686                 if (p_sw->is_leaf) {
1687                         qsort(p_sw->up_port_groups,     /* array */
1688                               p_sw->up_port_groups_num, /* number of elements */
1689                               sizeof(ftree_port_group_t *),     /* size of each element */
1690                               compare_port_groups_by_remote_switch_index);      /* comparator */
1691                 }
1692                 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1693
1694         }
1695 Exit:
1696         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1697 }                               /* fabric_make_indexing() */
1698 /***************************************************/
1699
1700 static int fabric_create_leaf_switch_array(IN ftree_fabric_t * p_ftree)
1701 {
1702         ftree_sw_t *p_sw;
1703         ftree_sw_t *p_next_sw;
1704         ftree_sw_t **all_switches_at_leaf_level;
1705         unsigned i;
1706         unsigned all_leaf_idx = 0;
1707         unsigned first_leaf_idx;
1708         unsigned last_leaf_idx;
1709         int res = 0;
1710
1711         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1712
1713         /* create array of ALL the switches that have leaf rank */
1714         all_switches_at_leaf_level = (ftree_sw_t **)
1715             malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1716         if (!all_switches_at_leaf_level) {
1717                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1718                            "Fat-tree routing: Memory allocation failed\n");
1719                 res = -1;
1720                 goto Exit;
1721         }
1722         memset(all_switches_at_leaf_level, 0,
1723                cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1724
1725         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1726         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1727                 p_sw = p_next_sw;
1728                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1729                 if (p_sw->rank == p_ftree->leaf_switch_rank) {
1730                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1731                                 "Adding switch 0x%" PRIx64
1732                                 " to full leaf switch array\n",
1733                                 sw_get_guid_ho(p_sw));
1734                         all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
1735                 }
1736         }
1737
1738         /* quick-sort array of leaf switches by index */
1739         qsort(all_switches_at_leaf_level,       /* array */
1740               all_leaf_idx,     /* number of elements */
1741               sizeof(ftree_sw_t *),     /* size of each element */
1742               compare_switches_by_index);       /* comparator */
1743
1744         /* check the first and the last REAL leaf (the one
1745            that has CNs) in the array of all the leafs */
1746
1747         first_leaf_idx = all_leaf_idx;
1748         last_leaf_idx = 0;
1749         for (i = 0; i < all_leaf_idx; i++) {
1750                 if (all_switches_at_leaf_level[i]->is_leaf) {
1751                         if (i < first_leaf_idx)
1752                                 first_leaf_idx = i;
1753                         last_leaf_idx = i;
1754                 }
1755         }
1756
1757         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1758                 "Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
1759                 first_leaf_idx, last_leaf_idx);
1760
1761         if (first_leaf_idx >= last_leaf_idx) {
1762                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
1763                            "Failed to find leaf switches - topology is not "
1764                            "fat-tree\n");
1765                 res = -1;
1766                 goto Exit;
1767         }
1768
1769         /* Create array of REAL leaf switches, sorted by index.
1770            This array may contain switches at the same rank w/o CNs,
1771            in case this is the order of indexing. */
1772         p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
1773         p_ftree->leaf_switches = (ftree_sw_t **)
1774             malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1775         if (!p_ftree->leaf_switches) {
1776                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1777                            "Fat-tree routing: Memory allocation failed\n");
1778                 res = -1;
1779                 goto Exit;
1780         }
1781
1782         memcpy(p_ftree->leaf_switches,
1783                &(all_switches_at_leaf_level[first_leaf_idx]),
1784                p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1785
1786         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1787                 "Created array of %u leaf switches\n",
1788                 p_ftree->leaf_switches_num);
1789
1790 Exit:
1791         free(all_switches_at_leaf_level);
1792         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1793         return res;
1794 }                               /* fabric_create_leaf_switch_array() */
1795
1796 /***************************************************/
1797
1798 static void fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
1799 {
1800         unsigned i;
1801         unsigned j;
1802         unsigned cns_on_this_leaf;
1803         ftree_sw_t *p_sw;
1804         ftree_port_group_t *p_group, *p_up_group;
1805         ftree_hca_t *p_hca;
1806
1807         for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1808                 p_sw = p_ftree->leaf_switches[i];
1809                 cns_on_this_leaf = 0;
1810                 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1811                         p_group = p_sw->down_port_groups[j];
1812                         if (p_group->remote_node_type != IB_NODE_TYPE_CA)
1813                                 continue;
1814                         p_hca = p_group->remote_hca_or_sw.p_hca;
1815                         /*
1816                          * Get the hca port group corresponding
1817                          * to the LID of remote HCA port
1818                          */
1819                         p_up_group = hca_get_port_group_by_lid(p_hca,
1820                                      p_group->remote_lid);
1821
1822                         CL_ASSERT(p_up_group);
1823
1824                         if (p_up_group->is_cn)
1825                                 cns_on_this_leaf++;
1826                 }
1827                 if (cns_on_this_leaf > p_ftree->max_cn_per_leaf)
1828                         p_ftree->max_cn_per_leaf = cns_on_this_leaf;
1829         }
1830 }                               /* fabric_set_max_cn_per_leaf() */
1831
1832 /***************************************************/
1833
1834 static boolean_t fabric_validate_topology(IN ftree_fabric_t * p_ftree)
1835 {
1836         ftree_port_group_t *p_group;
1837         ftree_port_group_t *p_ref_group;
1838         ftree_sw_t *p_sw;
1839         ftree_sw_t *p_next_sw;
1840         ftree_sw_t **reference_sw_arr;
1841         uint16_t tree_rank = fabric_get_rank(p_ftree);
1842         boolean_t res = TRUE;
1843         uint8_t i;
1844
1845         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1846
1847         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1848                 "Validating fabric topology\n");
1849
1850         reference_sw_arr =
1851             (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1852         if (reference_sw_arr == NULL) {
1853                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1854                            "Fat-tree routing: Memory allocation failed\n");
1855                 return FALSE;
1856         }
1857         memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1858
1859         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1860         while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1861                 p_sw = p_next_sw;
1862                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1863
1864                 if (!reference_sw_arr[p_sw->rank])
1865                         /* This is the first switch in the current level that
1866                            we're checking - use it as a reference */
1867                         reference_sw_arr[p_sw->rank] = p_sw;
1868                 else {
1869                         /* compare this switch properties to the reference switch */
1870
1871                         if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1872                             p_sw->up_port_groups_num) {
1873                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1874                                         "ERR AB09: Different number of upward port groups on switches:\n"
1875                                         "       GUID 0x%016" PRIx64
1876                                         ", LID %u, Index %s - %u groups\n"
1877                                         "       GUID 0x%016" PRIx64
1878                                         ", LID %u, Index %s - %u groups\n",
1879                                         sw_get_guid_ho
1880                                         (reference_sw_arr[p_sw->rank]),
1881                                         reference_sw_arr[p_sw->rank]->lid,
1882                                         tuple_to_str
1883                                         (reference_sw_arr[p_sw->rank]->tuple),
1884                                         reference_sw_arr[p_sw->
1885                                                          rank]->
1886                                         up_port_groups_num,
1887                                         sw_get_guid_ho(p_sw), p_sw->lid,
1888                                         tuple_to_str(p_sw->tuple),
1889                                         p_sw->up_port_groups_num);
1890                                 res = FALSE;
1891                                 break;
1892                         }
1893
1894                         if (p_sw->rank != (tree_rank - 1) &&
1895                             reference_sw_arr[p_sw->
1896                                              rank]->down_port_groups_num !=
1897                             p_sw->down_port_groups_num) {
1898                                 /* we're allowing some hca's to be missing */
1899                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1900                                         "ERR AB0A: Different number of downward port groups on switches:\n"
1901                                         "       GUID 0x%016" PRIx64
1902                                         ", LID %u, Index %s - %u port groups\n"
1903                                         "       GUID 0x%016" PRIx64
1904                                         ", LID %u, Index %s - %u port groups\n",
1905                                         sw_get_guid_ho
1906                                         (reference_sw_arr[p_sw->rank]),
1907                                         reference_sw_arr[p_sw->rank]->lid,
1908                                         tuple_to_str
1909                                         (reference_sw_arr[p_sw->rank]->tuple),
1910                                         reference_sw_arr[p_sw->
1911                                                          rank]->
1912                                         down_port_groups_num,
1913                                         sw_get_guid_ho(p_sw), p_sw->lid,
1914                                         tuple_to_str(p_sw->tuple),
1915                                         p_sw->down_port_groups_num);
1916                                 res = FALSE;
1917                                 break;
1918                         }
1919
1920                         if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1921                             0) {
1922                                 p_ref_group =
1923                                     reference_sw_arr[p_sw->
1924                                                      rank]->up_port_groups[0];
1925                                 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1926                                         p_group = p_sw->up_port_groups[i];
1927                                         if (cl_ptr_vector_get_size
1928                                             (&p_ref_group->ports) !=
1929                                             cl_ptr_vector_get_size
1930                                             (&p_group->ports)) {
1931                                                 OSM_LOG(&p_ftree->p_osm->log,
1932                                                         OSM_LOG_ERROR,
1933                                                         "ERR AB0B: Different number of ports in an upward port group on switches:\n"
1934                                                         "       GUID 0x%016"
1935                                                         PRIx64
1936                                                         ", LID %u, Index %s - %u ports\n"
1937                                                         "       GUID 0x%016"
1938                                                         PRIx64
1939                                                         ", LID %u, Index %s - %u ports\n",
1940                                                         sw_get_guid_ho
1941                                                         (reference_sw_arr
1942                                                          [p_sw->rank]),
1943                                                         reference_sw_arr[p_sw->
1944                                                                          rank]->
1945                                                         lid,
1946                                                         tuple_to_str
1947                                                         (reference_sw_arr
1948                                                          [p_sw->rank]->tuple),
1949                                                         cl_ptr_vector_get_size
1950                                                         (&p_ref_group->ports),
1951                                                         sw_get_guid_ho(p_sw),
1952                                                         p_sw->lid,
1953                                                         tuple_to_str(p_sw->
1954                                                                      tuple),
1955                                                         cl_ptr_vector_get_size
1956                                                         (&p_group->ports));
1957                                                 res = FALSE;
1958                                                 break;
1959                                         }
1960                                 }
1961                         }
1962                         if (reference_sw_arr[p_sw->rank]->down_port_groups_num
1963                             != 0 && p_sw->rank != (tree_rank - 1)) {
1964                                 /* we're allowing some hca's to be missing */
1965                                 p_ref_group =
1966                                     reference_sw_arr[p_sw->
1967                                                      rank]->down_port_groups[0];
1968                                 for (i = 0; i < p_sw->down_port_groups_num; i++) {
1969                                         p_group = p_sw->down_port_groups[0];
1970                                         if (cl_ptr_vector_get_size
1971                                             (&p_ref_group->ports) !=
1972                                             cl_ptr_vector_get_size
1973                                             (&p_group->ports)) {
1974                                                 OSM_LOG(&p_ftree->p_osm->log,
1975                                                         OSM_LOG_ERROR,
1976                                                         "ERR AB0C: Different number of ports in an downward port group on switches:\n"
1977                                                         "       GUID 0x%016"
1978                                                         PRIx64
1979                                                         ", LID %u, Index %s - %u ports\n"
1980                                                         "       GUID 0x%016"
1981                                                         PRIx64
1982                                                         ", LID %u, Index %s - %u ports\n",
1983                                                         sw_get_guid_ho
1984                                                         (reference_sw_arr
1985                                                          [p_sw->rank]),
1986                                                         reference_sw_arr[p_sw->
1987                                                                          rank]->
1988                                                         lid,
1989                                                         tuple_to_str
1990                                                         (reference_sw_arr
1991                                                          [p_sw->rank]->tuple),
1992                                                         cl_ptr_vector_get_size
1993                                                         (&p_ref_group->ports),
1994                                                         sw_get_guid_ho(p_sw),
1995                                                         p_sw->lid,
1996                                                         tuple_to_str(p_sw->
1997                                                                      tuple),
1998                                                         cl_ptr_vector_get_size
1999                                                         (&p_group->ports));
2000                                                 res = FALSE;
2001                                                 break;
2002                                         }
2003                                 }
2004                         }
2005                 }               /* end of else */
2006         }                       /* end of while */
2007
2008         if (res == TRUE)
2009                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
2010                         "Fabric topology has been identified as FatTree\n");
2011         else
2012                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2013                         "ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
2014
2015         free(reference_sw_arr);
2016         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2017         return res;
2018 }                               /* fabric_validate_topology() */
2019
2020 /***************************************************
2021  ***************************************************/
2022
2023 static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
2024                              IN void *context)
2025 {
2026         ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
2027         ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
2028
2029         p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid;
2030 }
2031
2032 /***************************************************
2033  ***************************************************/
2034
2035 /*
2036  * Function: Finds the least loaded port group and stores its counter
2037  * Given   : A switch
2038  */
2039 static inline void recalculate_min_counter_down(ftree_sw_t * p_sw)
2040 {
2041         uint32_t min = (1 << 30);
2042         uint32_t i;
2043         for (i = 0; i < p_sw->down_port_groups_num; i++) {
2044                 if (p_sw->down_port_groups[i]->counter_down < min) {
2045                         min = p_sw->down_port_groups[i]->counter_down;
2046                 }
2047         }
2048         p_sw->min_counter_down = min;
2049         return;
2050 }
2051
2052 /*
2053  * Function: Return the counter value of the least loaded down port group
2054  * Given   : A switch
2055  */
2056 static inline uint32_t find_lowest_loaded_group_on_sw(ftree_sw_t * p_sw)
2057 {
2058         return p_sw->min_counter_down;
2059 }
2060
2061 /*
2062  * Function: Compare the load of two port groups and return which is the least loaded
2063  * Given   : Two port groups with remote switch
2064  * When both port groups are equally loaded, it picks the one whom
2065  * remote switch down ports are least loaded.
2066  * This way, it prefers the switch from where it will be easier to go down (creating upward routes).
2067  * If both are equal, it picks the lowest INDEX to be deterministic.
2068  */
2069 static inline int port_group_compare_load_down(const ftree_port_group_t * p1,
2070                                                const ftree_port_group_t * p2)
2071 {
2072         int temp = p1->counter_down - p2->counter_down;
2073         if (temp > 0)
2074                 return 1;
2075         if (temp < 0)
2076                 return -1;
2077
2078         /* Find the less loaded remote sw and choose this one */
2079         do {
2080                 uint32_t load1 =
2081                     find_lowest_loaded_group_on_sw(p1->remote_hca_or_sw.p_sw);
2082                 uint32_t load2 =
2083                     find_lowest_loaded_group_on_sw(p2->remote_hca_or_sw.p_sw);
2084                 temp = load1 - load2;
2085                 if (temp > 0)
2086                         return 1;
2087         } while (0);
2088         /* If they are both equal, choose the lowest index */
2089         return compare_port_groups_by_remote_switch_index(&p1, &p2);
2090 }
2091
2092 static inline int port_group_compare_load_up(const ftree_port_group_t * p1,
2093                                              const ftree_port_group_t * p2)
2094 {
2095         int temp = p1->counter_up - p2->counter_up;
2096         if (temp > 0)
2097                 return 1;
2098         if (temp < 0)
2099                 return -1;
2100
2101         /* If they are both equal, choose the lowest index */
2102         return compare_port_groups_by_remote_switch_index (&p1,&p2);
2103 }
2104
2105 /*
2106  * Function: Sorts an array of port group by up load order
2107  * Given   : A port group array and its length
2108  * As the list is mostly sorted, we used a bubble sort instead of qsort
2109  * as it is much faster.
2110  *
2111  * Important note:
2112  * This function and bubble_sort_down must NOT be factorized.
2113  * Although most of the code is the same and a function pointer could be used
2114  * for the compareason function, it would prevent the compareason function to be inlined
2115  * and cost a great deal to performances.
2116  */
2117 static inline void
2118 bubble_sort_up(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2119 {
2120         uint32_t i = 0;
2121         uint32_t j = 0;
2122         ftree_port_group_t *tmp = p_group_array[0];
2123
2124         /* As this function is a great number of times, we only go into the loop
2125          * if one of the port counters has changed, thus saving some tests */
2126         if (tmp->hca_or_sw.p_sw->counter_up_changed == FALSE) {
2127                 return;
2128         }
2129         /* While we did modifications on the array order */
2130         /* i may grew above array length but next loop will fail and tmp will be null for the next time
2131          * this way we save a test i < nmemb for each pass through the loop */
2132         for (i = 0; tmp; i++) {
2133                 /* Assume the array is orderd */
2134                 tmp = NULL;
2135                 /* Comparing elements j and j-1 */
2136                 for (j = 1; j < (nmemb - i); j++) {
2137                         /* If they are the wrong way around */
2138                         if (port_group_compare_load_up(p_group_array[j],
2139                                                        p_group_array[j - 1]) < 0) {
2140                                 /* We invert them */
2141                                 tmp = p_group_array[j - 1];
2142                                 p_group_array[j - 1] = p_group_array[j];
2143                                 p_group_array[j] = tmp;
2144                                 /* This sets tmp != NULL so the main loop will make another pass */
2145                         }
2146                 }
2147         }
2148
2149         /* We have reordered the array so as long noone changes the counter
2150          * it's not necessary to do it again */
2151         p_group_array[0]->hca_or_sw.p_sw->counter_up_changed = FALSE;
2152 }
2153
2154 static inline void
2155 bubble_sort_siblings(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2156 {
2157         uint32_t i = 0;
2158         uint32_t j = 0;
2159         ftree_port_group_t *tmp = p_group_array[0];
2160
2161         /* While we did modifications on the array order */
2162         /* i may grew above array length but next loop will fail and tmp will be null for the next time
2163          * this way we save a test i < nmemb for each pass through the loop */
2164         for (i = 0; tmp != NULL; i++) {
2165                 /* Assume the array is orderd */
2166                 tmp = NULL;
2167                 /* Comparing elements j and j-1 */
2168                 for (j = 1; j < (nmemb - i); j++) {
2169                         /* If they are the wrong way around */
2170                         if (port_group_compare_load_up(p_group_array[j],
2171                                                        p_group_array[j - 1]) < 0) {
2172                                 /* We invert them */
2173                                 tmp = p_group_array[j - 1];
2174                                 p_group_array[j - 1] = p_group_array[j];
2175                                 p_group_array[j] = tmp;
2176                         }
2177                 }
2178         }
2179 }
2180
2181 /*
2182  * Function: Sorts an array of port group. Order is decide through
2183  * port_group_compare_load_down ( up counters, least load remote switch, biggest GUID)
2184  * Given   : A port group array and its length. Each port group points to a remote switch (not a HCA)
2185  * As the list is mostly sorted, we used a bubble sort instead of qsort
2186  * as it is much faster.
2187  *
2188  * Important note:
2189  * This function and bubble_sort_up must NOT be factorized.
2190  * Although most of the code is the same and a function pointer could be used
2191  * for the compareason function, it would prevent the compareason function to be inlined
2192  * and cost a great deal to performances.
2193  */
2194 static inline void
2195 bubble_sort_down(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2196 {
2197         uint32_t i = 0;
2198         uint32_t j = 0;
2199         ftree_port_group_t *tmp = p_group_array[0];
2200
2201         /* While we did modifications on the array order */
2202         /* i may grew above array length but next loop will fail and tmp will be null for the next time
2203          * this way we save a test i < nmemb for each pass through the loop */
2204         for (i = 0; tmp; i++) {
2205                 /* Assume the array is orderd */
2206                 tmp = NULL;
2207                 /* Comparing elements j and j-1 */
2208                 for (j = 1; j < (nmemb - i); j++) {
2209                         /* If they are the wrong way around */
2210                         if (port_group_compare_load_down
2211                             (p_group_array[j], p_group_array[j - 1]) < 0) {
2212                                 /* We invert them */
2213                                 tmp = p_group_array[j - 1];
2214                                 p_group_array[j - 1] = p_group_array[j];
2215                                 p_group_array[j] = tmp;
2216
2217                         }
2218                 }
2219         }
2220 }
2221
2222 /***************************************************
2223  ***************************************************/
2224
2225 /*
2226  * Function: assign-up-going-port-by-descending-down
2227  * Given   : a switch and a LID
2228  * Pseudo code:
2229  *    foreach down-going-port-group (in indexing order)
2230  *        skip this group if the LFT(LID) port is part of this group
2231  *        find the least loaded port of the group (scan in indexing order)
2232  *        r-port is the remote port connected to it
2233  *        assign the remote switch node LFT(LID) to r-port
2234  *        increase r-port usage counter
2235  *        assign-up-going-port-by-descending-down to r-port node (recursion)
2236  */
2237
2238 static boolean_t
2239 fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
2240                                    IN ftree_sw_t * p_sw,
2241                                    IN ftree_sw_t * p_prev_sw,
2242                                    IN uint16_t target_lid,
2243                                    IN boolean_t is_main_path,
2244                                    IN boolean_t is_target_a_sw,
2245                                    IN uint8_t current_hops)
2246 {
2247         ftree_sw_t *p_remote_sw;
2248         uint16_t ports_num;
2249         ftree_port_group_t *p_group;
2250         ftree_port_t *p_port;
2251         ftree_port_t *p_min_port;
2252         uint16_t j;
2253         uint16_t k;
2254         boolean_t created_route = FALSE;
2255         boolean_t routed = 0;
2256         uint8_t least_hops;
2257
2258         /* if there is no down-going ports */
2259         if (p_sw->down_port_groups_num == 0)
2260                 return FALSE;
2261
2262         /* foreach down-going port group (in load order) */
2263         bubble_sort_up(p_sw->down_port_groups, p_sw->down_port_groups_num);
2264
2265         if (p_sw->sibling_port_groups_num > 0)
2266                 bubble_sort_siblings(p_sw->sibling_port_groups,
2267                                      p_sw->sibling_port_groups_num);
2268
2269         for (k = 0;
2270              k <
2271              (p_sw->down_port_groups_num +
2272               ((target_lid != 0) ? p_sw->sibling_port_groups_num : 0)); k++) {
2273
2274                 if (k < p_sw->down_port_groups_num) {
2275                         p_group = p_sw->down_port_groups[k];
2276                 } else {
2277                         p_group =
2278                             p_sw->sibling_port_groups[k -
2279                                                       p_sw->
2280                                                       down_port_groups_num];
2281                 }
2282
2283                 /* If this port group doesn't point to a switch, mark
2284                    that the route was created and skip to the next group */
2285                 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) {
2286                         created_route = TRUE;
2287                         continue;
2288                 }
2289
2290                 if (p_prev_sw
2291                     && p_group->remote_lid == p_prev_sw->lid) {
2292                         /* This port group has a port that was used when we entered this switch,
2293                            which means that the current group points to the switch where we were
2294                            at the previous step of the algorithm (before going up).
2295                            Skipping this group. */
2296                         continue;
2297                 }
2298
2299                 /* find the least loaded port of the group (in indexing order) */
2300                 p_min_port = NULL;
2301                 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2302                 if(ports_num == 0)
2303                         continue;
2304
2305                 for (j = 0; j < ports_num; j++) {
2306                         cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2307                         /* first port that we're checking - set as port with the lowest load */
2308                         /* or this port is less loaded - use it as min */
2309                         if (!p_min_port ||
2310                             p_port->counter_up < p_min_port->counter_up)
2311                                 p_min_port = p_port;
2312                 }
2313                 /* At this point we have selected a port in this group with the
2314                    lowest load of upgoing routes.
2315                    Set on the remote switch how to get to the target_lid -
2316                    set LFT(target_lid) on the remote switch to the remote port */
2317                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2318                 least_hops = sw_get_least_hops(p_remote_sw, target_lid);
2319
2320                 if (least_hops != OSM_NO_PATH) {
2321                         /* Loop in the fabric - we already routed the remote switch
2322                            on our way UP, and now we see it again on our way DOWN */
2323                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2324                                 "Loop of length %d in the fabric:\n                             "
2325                                 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
2326                                 current_hops,
2327                                 tuple_to_str(p_remote_sw->tuple),
2328                                 p_group->lid,
2329                                 tuple_to_str(p_sw->tuple),
2330                                 p_group->remote_lid);
2331                         /* We skip only if we have come through a longer path */
2332                         if (current_hops + 1 >= least_hops)
2333                                 continue;
2334                 }
2335
2336                 /* Four possible cases:
2337                  *
2338                  *  1. is_main_path == TRUE:
2339                  *      - going DOWN(TRUE,TRUE) through ALL the groups
2340                  *         + promoting port counter
2341                  *         + setting path in remote switch fwd tbl
2342                  *         + setting hops in remote switch on all the ports of each group
2343                  *
2344                  *  2. is_main_path == FALSE:
2345                  *      - going DOWN(TRUE,FALSE) through ALL the groups but only if
2346                  *        the remote (lower) switch hasn't been already configured
2347                  *        for this target LID (or with a longer path)
2348                  *         + promoting port counter
2349                  *         + setting path in remote switch fwd tbl if it hasn't been set yet
2350                  *         + setting hops in remote switch on all the ports of each group
2351                  *           if it hasn't been set yet
2352                  */
2353
2354                 /* setting fwd tbl port only */
2355                 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2356                             p_min_port->remote_port_num;
2357                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2358                                 "Switch %s: set path to CA LID %u through port %u\n",
2359                                 tuple_to_str(p_remote_sw->tuple),
2360                                 target_lid, p_min_port->remote_port_num);
2361
2362                 /* On the remote switch that is pointed by the p_group,
2363                         set hops for ALL the ports in the remote group. */
2364
2365                 set_hops_on_remote_sw(p_group, target_lid,
2366                                       current_hops + 1, is_target_a_sw);
2367
2368                 /* Recursion step:
2369                    Assign upgoing ports by stepping down, starting on REMOTE switch */
2370                 routed = fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw,       /* remote switch - used as a route-upgoing alg. start point */
2371                                                             NULL,       /* prev. position - NULL to mark that we went down and not up */
2372                                                             target_lid, /* LID that we're routing to */
2373                                                             is_main_path,       /* whether this is path to HCA that should by tracked by counters */
2374                                                             is_target_a_sw,     /* Whether target lid is a switch or not */
2375                                                             current_hops + 1);  /* Number of hops done to this point */
2376                 created_route |= routed;
2377                 /* Counters are promoted only if a route toward a node is created */
2378                 if (routed) {
2379                         p_min_port->counter_up++;
2380                         p_group->counter_up++;
2381                         p_group->hca_or_sw.p_sw->counter_up_changed = TRUE;
2382                 }
2383         }
2384         /* done scanning all the down-going port groups */
2385
2386         /* if the route was created, promote the index that
2387            indicates which group should we start with when
2388            going through all the downgoing groups */
2389         if (created_route)
2390                 p_sw->down_port_groups_idx = (p_sw->down_port_groups_idx + 1)
2391                     % p_sw->down_port_groups_num;
2392
2393         return created_route;
2394 }                               /* fabric_route_upgoing_by_going_down() */
2395
2396 /***************************************************/
2397
2398 /*
2399  * Function: assign-down-going-port-by-ascending-up
2400  * Given   : a switch and a LID
2401  * Pseudo code:
2402  *    find the least loaded port of all the upgoing groups (scan in indexing order)
2403  *    assign the LFT(LID) of remote switch to that port
2404  *    track that port usage
2405  *    assign-up-going-port-by-descending-down on CURRENT switch
2406  *    assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
2407  */
2408
2409 static boolean_t
2410 fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2411                                    IN ftree_sw_t * p_sw,
2412                                    IN ftree_sw_t * p_prev_sw,
2413                                    IN uint16_t target_lid,
2414                                    IN boolean_t is_main_path,
2415                                    IN boolean_t is_target_a_sw,
2416                                    IN uint16_t reverse_hop_credit,
2417                                    IN uint16_t reverse_hops,
2418                                    IN uint8_t current_hops)
2419 {
2420         ftree_sw_t *p_remote_sw;
2421         uint16_t ports_num;
2422         ftree_port_group_t *p_group;
2423         ftree_port_t *p_port;
2424         ftree_port_group_t *p_min_group;
2425         ftree_port_t *p_min_port;
2426         uint16_t i;
2427         uint16_t j;
2428         boolean_t created_route = FALSE;
2429         boolean_t routed = FALSE;
2430
2431
2432         /* Assign upgoing ports by stepping down, starting on THIS switch */
2433         created_route = fabric_route_upgoing_by_going_down(p_ftree, p_sw,       /* local switch - used as a route-upgoing alg. start point */
2434                                                            p_prev_sw,   /* switch that we went up from (NULL means that we went down) */
2435                                                            target_lid,  /* LID that we're routing to */
2436                                                            is_main_path,        /* whether this path to HCA should by tracked by counters */
2437                                                            is_target_a_sw,      /* Whether target lid is a switch or not */
2438                                                            current_hops);       /* Number of hops done up to this point */
2439
2440         /* recursion stop condition - if it's a root switch, */
2441         if (p_sw->rank == 0) {
2442                 if (reverse_hop_credit > 0) {
2443                         /* We go up by going down as we have some reverse_hop_credit left */
2444                         /* We use the index to scatter a bit the reverse up routes */
2445                         p_sw->down_port_groups_idx =
2446                             (p_sw->down_port_groups_idx +
2447                              1) % p_sw->down_port_groups_num;
2448                         i = p_sw->down_port_groups_idx;
2449                         for (j = 0; j < p_sw->down_port_groups_num; j++) {
2450
2451                                 p_group = p_sw->down_port_groups[i];
2452                                 i = (i + 1) % p_sw->down_port_groups_num;
2453
2454                                 /* Skip this port group unless it points to a switch */
2455                                 if (p_group->remote_node_type !=
2456                                     IB_NODE_TYPE_SWITCH)
2457                                         continue;
2458                                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2459
2460                                 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,       /* remote switch - used as a route-downgoing alg. next step point */
2461                                                                                     p_sw,       /* this switch - prev. position switch for the function */
2462                                                                                     target_lid, /* LID that we're routing to */
2463                                                                                     is_main_path,       /* whether this is path to HCA that should by tracked by counters */
2464                                                                                     is_target_a_sw,     /* Whether target lid is a switch or not */
2465                                                                                     reverse_hop_credit - 1,     /* Remaining reverse_hops allowed */
2466                                                                                     reverse_hops + 1,   /* Number of reverse_hops done up to this point */
2467                                                                                     current_hops
2468                                                                                     +
2469                                                                                     1);
2470                         }
2471
2472                 }
2473                 return created_route;
2474         }
2475
2476         /* We should generate a list of port sorted by load so we can find easily the least
2477          * going port and explore the other pots on secondary routes more easily (and quickly) */
2478         bubble_sort_down(p_sw->up_port_groups, p_sw->up_port_groups_num);
2479
2480         p_min_group = p_sw->up_port_groups[0];
2481         /* Find the least loaded upgoing port in the selected group */
2482         p_min_port = NULL;
2483         ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2484         for (j = 0; j < ports_num; j++) {
2485                 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2486                 if (!p_min_port) {
2487                         /* first port that we're checking - use
2488                            it as a port with the lowest load */
2489                         p_min_port = p_port;
2490                 } else if (p_port->counter_down < p_min_port->counter_down) {
2491                         /* this port is less loaded - use it as min */
2492                         p_min_port = p_port;
2493                 }
2494         }
2495
2496         /* At this point we have selected a group and port with the
2497            lowest load of downgoing routes.
2498            Set on the remote switch how to get to the target_lid -
2499            set LFT(target_lid) on the remote switch to the remote port */
2500         p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2501
2502         /* Four possible cases:
2503          *
2504          *  1. is_main_path == TRUE:
2505          *      - going UP(TRUE,TRUE) on selected min_group and min_port
2506          *         + promoting port counter
2507          *         + setting path in remote switch fwd tbl
2508          *         + setting hops in remote switch on all the ports of selected group
2509          *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2510          *         + NOT promoting port counter
2511          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2512          *         + setting hops in remote switch on all the ports of each group
2513          *           if it hasn't been set yet
2514          *
2515          *  2. is_main_path == FALSE:
2516          *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2517          *        but only if the remote (upper) switch hasn't been already
2518          *        configured for this target LID
2519          *         + NOT promoting port counter
2520          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2521          *         + setting hops in remote switch on all the ports of each group
2522          *           if it hasn't been set yet
2523          */
2524
2525         /* covering first half of case 1, and case 3 */
2526         if (is_main_path) {
2527                 if (p_sw->is_leaf) {
2528                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2529                                 " - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2530                                 (target_lid != 0) ? "real" : "DUMMY",
2531                                 target_lid,
2532                                 tuple_to_str(p_sw->tuple),
2533                                 tuple_to_str(p_remote_sw->tuple));
2534                 }
2535                 /* The number of downgoing routes is tracked in the
2536                    p_group->counter_down p_port->counter_down counters of the
2537                    group and port that belong to the lower side of the link
2538                    (on switch with higher rank) */
2539                 p_min_group->counter_down++;
2540                 p_min_port->counter_down++;
2541                 if (p_min_group->counter_down ==
2542                     (p_min_group->remote_hca_or_sw.p_sw->min_counter_down +
2543                      1)) {
2544                         recalculate_min_counter_down
2545                             (p_min_group->remote_hca_or_sw.p_sw);
2546                 }
2547
2548                 /* This LID may already be in the LFT in the reverse_hop feature is used */
2549                 /* We update the LFT only if this LID isn't already present. */
2550
2551                 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2552                 if ((p_remote_sw->p_osm_sw->new_lft[target_lid] == OSM_NO_PATH)
2553                     ||
2554                     (current_hops + 1 <
2555                      sw_get_least_hops(p_remote_sw, target_lid))) {
2556
2557                         p_remote_sw->p_osm_sw->new_lft[target_lid] =
2558                                 p_min_port->remote_port_num;
2559                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2560                                         "Switch %s: set path to CA LID %u through port %u\n",
2561                                         tuple_to_str(p_remote_sw->tuple),
2562                                         target_lid,
2563                                         p_min_port->remote_port_num);
2564
2565                         /* On the remote switch that is pointed by the min_group,
2566                         set hops for ALL the ports in the remote group. */
2567
2568                         set_hops_on_remote_sw(p_min_group, target_lid,
2569                                                       current_hops + 1,
2570                                                       is_target_a_sw);
2571                 }
2572         /* Recursion step: Assign downgoing ports by stepping up, starting on REMOTE switch. */
2573         created_route |= fabric_route_downgoing_by_going_up(p_ftree,
2574                                                             p_remote_sw,        /* remote switch - used as a route-downgoing alg. next step point */
2575                                                             p_sw,               /* this switch - prev. position switch for the function */
2576                                                             target_lid,         /* LID that we're routing to */
2577                                                             is_main_path,       /* whether this is path to HCA that should by tracked by counters */
2578                                                             is_target_a_sw,     /* Whether target lid is a switch or not */
2579                                                             reverse_hop_credit, /* Remaining reverse_hops allowed */
2580                                                             reverse_hops,       /* Number of reverse_hops done up to this point */
2581                                                             current_hops + 1);
2582         }
2583
2584         /* What's left to do at this point:
2585          *
2586          *  1. is_main_path == TRUE:
2587          *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2588          *        but only if the remote (upper) switch hasn't been already
2589          *        configured for this target LID
2590          *         + NOT promoting port counter
2591          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2592          *         + setting hops in remote switch on all the ports of each group
2593          *           if it hasn't been set yet
2594          *
2595          *  2. is_main_path == FALSE:
2596          *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2597          *        but only if the remote (upper) switch hasn't been already
2598          *        configured for this target LID
2599          *         + NOT promoting port counter
2600          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2601          *         + setting hops in remote switch on all the ports of each group
2602          *           if it hasn't been set yet
2603          *
2604          *  These two rules can be rephrased this way:
2605          *   - foreach UP port group
2606          *      + if remote switch has been set with the target LID
2607          *         - skip this port group
2608          *      + else
2609          *         - select port 0
2610          *         - do NOT promote port counter
2611          *         - set path in remote switch fwd tbl
2612          *         - set hops in remote switch on all the ports of this group
2613          *         - go UP(TRUE,FALSE) to the remote switch
2614          */
2615
2616         for (i = is_main_path ? 1 : 0; i < p_sw->up_port_groups_num; i++) {
2617                 p_group = p_sw->up_port_groups[i];
2618                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2619
2620                 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2621                 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2622                         if (current_hops + 1 >=
2623                             sw_get_least_hops(p_remote_sw, target_lid))
2624                                 continue;
2625
2626                 if (p_sw->is_leaf) {
2627                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2628                                 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2629                                 target_lid,
2630                                 tuple_to_str(p_sw->tuple),
2631                                 tuple_to_str(p_remote_sw->tuple));
2632                 }
2633
2634                 /* Routing REAL lids on SECONDARY path means routing
2635                    switch-to-switch or switch-to-CA paths.
2636                    We can safely assume that switch will initiate very
2637                    few traffic, so there's no point wasting runtime on
2638                    trying to balance these routes - always pick port 0. */
2639                 p_min_port = NULL;
2640                 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2641                 if(ports_num == 0)
2642                         continue;
2643                 for (j = 0; j < ports_num; j++) {
2644                         cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2645                         if (!p_min_port) {
2646                                 /* first port that we're checking - use
2647                                    it as a port with the lowest load */
2648                                 p_min_port = p_port;
2649                         } else if (p_port->counter_down <
2650                                    p_min_port->counter_down) {
2651                                 /* this port is less loaded - use it as min */
2652                                 p_min_port = p_port;
2653                         }
2654                 }
2655
2656                 p_port = p_min_port;
2657                 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2658                     p_port->remote_port_num;
2659
2660                 /* On the remote switch that is pointed by the p_group,
2661                    set hops for ALL the ports in the remote group. */
2662
2663                 set_hops_on_remote_sw(p_group, target_lid,
2664                                       current_hops + 1, is_target_a_sw);
2665
2666                 /* Recursion step:
2667                    Assign downgoing ports by stepping up, starting on REMOTE switch. */
2668                 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,       /* remote switch - used as a route-downgoing alg. next step point */
2669                                                             p_sw,       /* this switch - prev. position switch for the function */
2670                                                             target_lid, /* LID that we're routing to */
2671                                                             FALSE,      /* whether this is path to HCA that should by tracked by counters */
2672                                                             is_target_a_sw,     /* Whether target lid is a switch or not */
2673                                                             reverse_hop_credit, /* Remaining reverse_hops allowed */
2674                                                             reverse_hops,       /* Number of reverse_hops done up to this point */
2675                                                             current_hops + 1);
2676                 created_route |= routed;
2677         }
2678
2679         /* Now doing the same thing with horizontal links */
2680         if (p_sw->sibling_port_groups_num > 0)
2681                 bubble_sort_down(p_sw->sibling_port_groups,
2682                                  p_sw->sibling_port_groups_num);
2683
2684         for (i = 0; i < p_sw->sibling_port_groups_num; i++) {
2685                 p_group = p_sw->sibling_port_groups[i];
2686                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2687
2688                 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2689                 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2690                         if (current_hops + 1 >=
2691                             sw_get_least_hops(p_remote_sw, target_lid))
2692                                 continue;
2693
2694                 if (p_sw->is_leaf) {
2695                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2696                                 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2697                                 target_lid,
2698                                 tuple_to_str(p_sw->tuple),
2699                                 tuple_to_str(p_remote_sw->tuple));
2700                 }
2701
2702                 /* Routing REAL lids on SECONDARY path means routing
2703                    switch-to-switch or switch-to-CA paths.
2704                    We can safely assume that switch will initiate very
2705                    few traffic, so there's no point wasting runtime on
2706                    trying to balance these routes - always pick port 0. */
2707
2708                 p_min_port = NULL;
2709                 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2710                 for (j = 0; j < ports_num; j++) {
2711                         cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2712                         if (!p_min_port) {
2713                                 /* first port that we're checking - use
2714                                    it as a port with the lowest load */
2715                                 p_min_port = p_port;
2716                         } else if (p_port->counter_down <
2717                                    p_min_port->counter_down) {
2718                                 /* this port is less loaded - use it as min */
2719                                 p_min_port = p_port;
2720                         }
2721                 }
2722
2723                 p_port = p_min_port;
2724                 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2725                     p_port->remote_port_num;
2726
2727                 /* On the remote switch that is pointed by the p_group,
2728                    set hops for ALL the ports in the remote group. */
2729
2730                 set_hops_on_remote_sw(p_group, target_lid,
2731                                       current_hops + 1, is_target_a_sw);
2732
2733                 /* Recursion step:
2734                    Assign downgoing ports by stepping up, starting on REMOTE switch. */
2735                 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,       /* remote switch - used as a route-downgoing alg. next step point */
2736                                                             p_sw,       /* this switch - prev. position switch for the function */
2737                                                             target_lid, /* LID that we're routing to */
2738                                                             FALSE,      /* whether this is path to HCA that should by tracked by counters */
2739                                                             is_target_a_sw,     /* Whether target lid is a switch or not */
2740                                                             reverse_hop_credit, /* Remaining reverse_hops allowed */
2741                                                             reverse_hops,       /* Number of reverse_hops done up to this point */
2742                                                             current_hops + 1);
2743                 created_route |= routed;
2744                 if (routed) {
2745                         p_min_group->counter_down++;
2746                         p_min_port->counter_down++;
2747                 }
2748         }
2749
2750         /* If we don't have any reverse hop credits, we are done */
2751         if (reverse_hop_credit == 0)
2752                 return created_route;
2753
2754         if (p_sw->is_leaf)
2755                 return created_route;
2756
2757         /* We explore all the down group ports */
2758         /* We try to reverse jump for each of them */
2759         /* They already have a route to us from the upgoing_by_going_down started earlier */
2760         /* This is only so it'll continue exploring up, after this step backwards */
2761         for (i = 0; i < p_sw->down_port_groups_num; i++) {
2762                 p_group = p_sw->down_port_groups[i];
2763                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2764
2765                 /* Skip this port group unless it points to a switch */
2766                 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
2767                         continue;
2768
2769                 /* Recursion step:
2770                    Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */
2771                 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,       /* remote switch - used as a route-downgoing alg. next step point */
2772                                                                     p_sw,       /* this switch - prev. position switch for the function */
2773                                                                     target_lid, /* LID that we're routing to */
2774                                                                     TRUE,       /* whether this is path to HCA that should by tracked by counters */
2775                                                                     is_target_a_sw,     /* Whether target lid is a switch or not */
2776                                                                     reverse_hop_credit - 1,     /* Remaining reverse_hops allowed */
2777                                                                     reverse_hops + 1,   /* Number of reverse_hops done up to this point */
2778                                                                     current_hops
2779                                                                     + 1);
2780         }
2781         return created_route;
2782
2783 }                               /* ftree_fabric_route_downgoing_by_going_up() */
2784
2785 /***************************************************/
2786
2787 /*
2788  * Pseudo code:
2789  *    foreach leaf switch (in indexing order)
2790  *       for each compute node (in indexing order)
2791  *          obtain the LID of the compute node
2792  *          set local LFT(LID) of the port connecting to compute node
2793  *          call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2794  *       for each MISSING compute node
2795  *          call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
2796  */
2797
2798 static void fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
2799 {
2800         ftree_sw_t *p_sw;
2801         ftree_hca_t *p_hca;
2802         ftree_port_group_t *p_leaf_port_group;
2803         ftree_port_group_t *p_hca_port_group;
2804         ftree_port_t *p_port;
2805         unsigned int i, j;
2806         uint16_t hca_lid;
2807         unsigned routed_targets_on_leaf;
2808
2809         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2810
2811         /* for each leaf switch (in indexing order) */
2812         for (i = 0; i < p_ftree->leaf_switches_num; i++) {
2813                 p_sw = p_ftree->leaf_switches[i];
2814                 routed_targets_on_leaf = 0;
2815
2816                 /* for each HCA connected to this switch */
2817                 for (j = 0; j < p_sw->down_port_groups_num; j++) {
2818                         p_leaf_port_group = p_sw->down_port_groups[j];
2819
2820                         /* work with this port group only if the remote node is CA */
2821                         if (p_leaf_port_group->remote_node_type !=
2822                             IB_NODE_TYPE_CA)
2823                                 continue;
2824
2825                         p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;
2826
2827                         /* work with this port group only if remote HCA has CNs */
2828                         if (!p_hca->cn_num)
2829                                 continue;
2830
2831                         p_hca_port_group =
2832                             hca_get_port_group_by_lid(p_hca,
2833                                                       p_leaf_port_group->
2834                                                       remote_lid);
2835                         CL_ASSERT(p_hca_port_group);
2836
2837                         /* work with this port group only if remote port is CN */
2838                         if (!p_hca_port_group->is_cn)
2839                                 continue;
2840
2841                         /* obtain the LID of HCA port */
2842                         hca_lid = p_leaf_port_group->remote_lid;
2843
2844                         /* set local LFT(LID) to the port that is connected to HCA */
2845                         cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
2846                                          (void *)&p_port);
2847                         p_sw->p_osm_sw->new_lft[hca_lid] = p_port->port_num;
2848
2849                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2850                                 "Switch %s: set path to CN LID %u through port %u\n",
2851                                 tuple_to_str(p_sw->tuple),
2852                                 hca_lid, p_port->port_num);
2853
2854                         /* set local min hop table(LID) to route to the CA */
2855                         sw_set_hops(p_sw, hca_lid, p_port->port_num, 1, FALSE);
2856
2857                         /* Assign downgoing ports by stepping up.
2858                            Since we're routing here only CNs, we're routing it as REAL
2859                            LID and updating fat-tree balancing counters. */
2860                         fabric_route_downgoing_by_going_up(p_ftree, p_sw,       /* local switch - used as a route-downgoing alg. start point */
2861                                                            NULL,        /* prev. position switch */
2862                                                            hca_lid,     /* LID that we're routing to */
2863                                                            TRUE,        /* whether this path to HCA should by tracked by counters */
2864                                                            FALSE,       /* whether target lid is a switch or not */
2865                                                            0,   /* Number of reverse hops allowed */
2866                                                            0,   /* Number of reverse hops done yet */
2867                                                            1);  /* Number of hops done yet */
2868
2869                         /* count how many real targets have been routed from this leaf switch */
2870                         routed_targets_on_leaf++;
2871                 }
2872
2873                 /* We're done with the real targets (all CNs) of this leaf switch.
2874                    Now route the dummy HCAs that are missing or that are non-CNs.
2875                    When routing to dummy HCAs we don't fill lid matrices. */
2876                 if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
2877                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2878                                 "Routing %u dummy CAs\n",
2879                                 p_ftree->max_cn_per_leaf -
2880                                 p_sw->down_port_groups_num);
2881                         for (j = 0; j <
2882                              p_ftree->max_cn_per_leaf - routed_targets_on_leaf;
2883                              j++) {
2884                                 ftree_sw_t *p_next_sw, *p_ftree_sw;
2885                                 sw_set_hops(p_sw, 0, 0xFF, 1, FALSE);
2886                                 /* assign downgoing ports by stepping up */
2887                                 fabric_route_downgoing_by_going_up(p_ftree, p_sw,       /* local switch - used as a route-downgoing alg. start point */
2888                                                                    NULL,        /* prev. position switch */
2889                                                                    0,   /* LID that we're routing to - ignored for dummy HCA */
2890                                                                    TRUE,        /* whether this path to HCA should by tracked by counters */
2891                                                                    FALSE,       /* Whether the target LID is a switch or not */
2892                                                                    0,   /* Number of reverse hops allowed */
2893                                                                    0,   /* Number of reverse hops done yet */
2894                                                                    1);  /* Number of hops done yet */
2895
2896                                 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
2897                                 /* need to clean the LID 0 hops for dummy node */
2898                                 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
2899                                         p_ftree_sw = p_next_sw;
2900                                         p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_ftree_sw->map_item);
2901                                         p_ftree_sw->hops[0] = OSM_NO_PATH;
2902                                         p_ftree_sw->p_osm_sw->new_lft[0] = OSM_NO_PATH;
2903                                 }
2904
2905                         }
2906                 }
2907         }
2908         /* done going through all the leaf switches */
2909         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2910 }                               /* fabric_route_to_cns() */
2911
2912 /***************************************************/
2913
2914 /*
2915  * Pseudo code:
2916  *    foreach HCA non-CN port in fabric
2917  *       obtain the LID of the HCA port
2918  *       get switch that is connected to this HCA port
2919  *       set switch LFT(LID) to the port connected to the HCA port
2920  *       call assign-down-going-port-by-ascending-up(TRUE,TRUE) on the switch
2921  *
2922  * Routing to these HCAs is routing a REAL hca lid on MAIN path.
2923  * We want to allow load-leveling of the traffic to the non-CNs,
2924  * because such nodes may include IO nodes with heavy usage
2925  *   - we should set fwd tables
2926  *   - we should update port counters
2927  * Routing to non-CNs is done after routing to CNs, so updated port
2928  * counters will not affect CN-to-CN routing.
2929  */
2930
2931 static void fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
2932 {
2933         ftree_sw_t *p_sw;
2934         ftree_hca_t *p_hca;
2935         ftree_hca_t *p_next_hca;
2936         ftree_port_t *p_hca_port;
2937         ftree_port_group_t *p_hca_port_group;
2938         uint16_t hca_lid;
2939         unsigned port_num_on_switch;
2940         unsigned i;
2941
2942         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2943
2944         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
2945         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
2946                 p_hca = p_next_hca;
2947                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
2948
2949                 for (i = 0; i < p_hca->up_port_groups_num; i++) {
2950                         p_hca_port_group = p_hca->up_port_groups[i];
2951
2952                         /* skip this port if it's CN, in which case it has been already routed */
2953                         if (p_hca_port_group->is_cn)
2954                                 continue;
2955
2956                         /* skip this port if it is not connected to switch */
2957                         if (p_hca_port_group->remote_node_type !=
2958                             IB_NODE_TYPE_SWITCH)
2959                                 continue;
2960
2961                         p_sw = p_hca_port_group->remote_hca_or_sw.p_sw;
2962                         hca_lid = p_hca_port_group->lid;
2963
2964                         /* set switches  LFT(LID) to the port that is connected to HCA */
2965                         cl_ptr_vector_at(&p_hca_port_group->ports, 0,
2966                                          (void *)&p_hca_port);
2967                         port_num_on_switch = p_hca_port->remote_port_num;
2968                         p_sw->p_osm_sw->new_lft[hca_lid] = port_num_on_switch;
2969
2970                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2971                                 "Switch %s: set path to non-CN HCA LID %u through port %u\n",
2972                                 tuple_to_str(p_sw->tuple),
2973                                 hca_lid, port_num_on_switch);
2974
2975                         /* set local min hop table(LID) to route to the CA */
2976                         sw_set_hops(p_sw, hca_lid, port_num_on_switch,  /* port num */
2977                                     1, FALSE);  /* hops */
2978
2979                         /* Assign downgoing ports by stepping up.
2980                            We're routing REAL targets. They are not CNs and not included
2981                            in the leafs array, but we treat them as MAIN path to allow load
2982                            leveling, which means that the counters will be updated. */
2983                         fabric_route_downgoing_by_going_up(p_ftree, p_sw,       /* local switch - used as a route-downgoing alg. start point */
2984                                                            NULL,        /* prev. position switch */
2985                                                            hca_lid,     /* LID that we're routing to */
2986                                                            TRUE,        /* whether this path to HCA should by tracked by counters */
2987                                                            FALSE,       /* Whether the target LID is a switch or not */
2988                                                            p_hca_port_group->is_io ? p_ftree->p_osm->subn.opt.max_reverse_hops : 0,     /* Number or reverse hops allowed */
2989                                                            0,   /* Number or reverse hops done yet */
2990                                                            1);  /* Number of hops done yet */
2991                 }
2992                 /* done with all the port groups of this HCA - go to next HCA */
2993         }
2994
2995         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2996 }                               /* fabric_route_to_non_cns() */
2997
2998 /***************************************************/
2999
3000 /*
3001  * Pseudo code:
3002  *    foreach switch in fabric
3003  *       obtain its LID
3004  *       set local LFT(LID) to port 0
3005  *       call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
3006  *
3007  * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
3008  *   - we should set fwd tables
3009  *   - we should NOT update port counters
3010  */
3011
3012 static void fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
3013 {
3014         ftree_sw_t *p_sw;
3015         ftree_sw_t *p_next_sw;
3016
3017         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3018
3019         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3020         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3021                 p_sw = p_next_sw;
3022                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3023
3024                 /* set local LFT(LID) to 0 (route to itself) */
3025                 p_sw->p_osm_sw->new_lft[p_sw->lid] = 0;
3026
3027                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3028                         "Switch %s (LID %u): routing switch-to-switch paths\n",
3029                         tuple_to_str(p_sw->tuple), p_sw->lid);
3030
3031                 /* set min hop table of the switch to itself */
3032                 sw_set_hops(p_sw, p_sw->lid, 0, /* port_num */
3033                             0, TRUE);   /* hops     */
3034
3035                 fabric_route_downgoing_by_going_up(p_ftree, p_sw,       /* local switch - used as a route-downgoing alg. start point */
3036                                                    NULL,        /* prev. position switch */
3037                                                    p_sw->lid,   /* LID that we're routing to */
3038                                                    FALSE,       /* whether this path to HCA should by tracked by counters */
3039                                                    TRUE,        /* Whether the target LID is a switch or not */
3040                                                    0,   /* Number of reverse hops allowed */
3041                                                    0,   /* Number of reverse hops done yet */
3042                                                    0);  /* Number of hops done yet */
3043         }
3044
3045         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3046 }                               /* fabric_route_to_switches() */
3047
3048 /***************************************************
3049  ***************************************************/
3050
3051 static void fabric_route_roots(IN ftree_fabric_t * p_ftree)
3052 {
3053         uint16_t lid;
3054         uint8_t port_num;
3055         osm_port_t *p_port;
3056         ftree_sw_t *p_sw;
3057         ftree_sw_t *p_leaf_sw;
3058
3059         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3060
3061         /*
3062          * We need a switch that will accomodate all the down/up turns in
3063          * the fabric. Having these turn in a single place in the fabric
3064          * will not create credit loops.
3065          * So we need to select this switch.
3066          * The idea here is to chose leaf with the highest index. I don't
3067          * have any theory to back me up on this. It's just a general thought
3068          * that this way the switch that might be a bottleneck for many mcast
3069          * groups will be far away from the OpenSM, so it will draw the
3070          * multicast traffic away from the SM.
3071          */
3072
3073         p_leaf_sw = p_ftree->leaf_switches[p_ftree->leaf_switches_num-1];
3074
3075         /*
3076          * Now go over all the switches in the fabric that
3077          * have lower rank, and route the missing LIDs to
3078          * the selected leaf switch.
3079          * In short, this leaf switch now poses a target
3080          * for all those missing LIDs.
3081          */
3082
3083         for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3084              p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
3085              p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
3086
3087                 if (p_sw->rank >= p_ftree->leaf_switch_rank)
3088                         continue;
3089
3090                 for (lid = 1; lid <= p_leaf_sw->p_osm_sw->max_lid_ho; lid ++) {
3091
3092                         if (p_sw->p_osm_sw->new_lft[lid] != OSM_NO_PATH ||
3093                             p_leaf_sw->hops[lid] == OSM_NO_PATH)
3094                                 continue;
3095
3096                         p_port = osm_get_port_by_lid_ho(&p_ftree->p_osm->subn,
3097                                                         lid);
3098
3099                         /* we're interested only in switches */
3100                         if (!p_port || !p_port->p_node->sw)
3101                                 continue;
3102
3103                         /*
3104                          * the missing LID will be routed through the same
3105                          * port that routes to the selected leaf switch
3106                          */
3107                         port_num = p_sw->p_osm_sw->new_lft[p_leaf_sw->lid];
3108
3109                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3110                                 "Switch %s: setting path to LID %u "
3111                                 "through port %u\n",
3112                                 tuple_to_str(p_sw->tuple), lid, port_num);
3113
3114                         /* set local lft */
3115                         p_sw->p_osm_sw->new_lft[lid] = port_num;
3116
3117                         /*
3118                          * Set local min hop table.
3119                          * The distance to the target LID is a distance
3120                          * to the selected leaf switch plus the distance
3121                          * from the leaf to the target LID.
3122                          */
3123                         sw_set_hops(p_sw, lid, port_num,
3124                                 p_sw->hops[p_leaf_sw->lid] +
3125                                 p_leaf_sw->hops[lid], TRUE);
3126                 }
3127         }
3128
3129         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3130 }                               /* fabric_route_roots() */
3131
3132 /***************************************************/
3133
3134 static int fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
3135 {
3136         osm_node_t *p_osm_node;
3137         osm_node_t *p_next_osm_node;
3138
3139         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3140
3141         p_next_osm_node =
3142             (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl);
3143         while (p_next_osm_node !=
3144                (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->
3145                                           subn.node_guid_tbl)) {
3146                 p_osm_node = p_next_osm_node;
3147                 p_next_osm_node =
3148                     (osm_node_t *) cl_qmap_next(&p_osm_node->map_item);
3149                 switch (osm_node_get_type(p_osm_node)) {
3150                 case IB_NODE_TYPE_CA:
3151                         fabric_add_hca(p_ftree, p_osm_node);
3152                         break;
3153                 case IB_NODE_TYPE_ROUTER:
3154                         break;
3155                 case IB_NODE_TYPE_SWITCH:
3156                         fabric_add_sw(p_ftree, p_osm_node->sw);
3157                         break;
3158                 default:
3159                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3160                                 "ERR AB0E: " "Node GUID 0x%016" PRIx64
3161                                 " - Unknown node type: %s\n",
3162                                 cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
3163                                 ib_get_node_type_str(osm_node_get_type
3164                                                      (p_osm_node)));
3165                         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3166                         return -1;
3167                 }
3168         }
3169
3170         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3171         return 0;
3172 }                               /* fabric_populate_nodes() */
3173
3174 /***************************************************
3175  ***************************************************/
3176
3177 static boolean_t sw_update_rank(IN ftree_sw_t * p_sw, IN uint32_t new_rank)
3178 {
3179         if (sw_ranked(p_sw) && p_sw->rank <= new_rank)
3180                 return FALSE;
3181         p_sw->rank = new_rank;
3182         return TRUE;
3183
3184 }
3185
3186 /***************************************************/
3187
3188 static void rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
3189                                      IN cl_list_t * p_ranking_bfs_list)
3190 {
3191         ftree_sw_t *p_sw;
3192         ftree_sw_t *p_remote_sw;
3193         osm_node_t *p_node;
3194         osm_node_t *p_remote_node;
3195         osm_physp_t *p_osm_port;
3196         uint8_t i;
3197         unsigned max_rank = 0;
3198
3199         while (!cl_is_list_empty(p_ranking_bfs_list)) {
3200                 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
3201                 p_node = p_sw->p_osm_sw->p_node;
3202
3203                 /* note: skipping port 0 on switches */
3204                 for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
3205                         p_osm_port = osm_node_get_physp_ptr(p_node, i);
3206                         if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3207                                 continue;
3208
3209                         p_remote_node =
3210                             osm_node_get_remote_node(p_node, i, NULL);
3211                         if (!p_remote_node)
3212                                 continue;
3213                         if (osm_node_get_type(p_remote_node) !=
3214                             IB_NODE_TYPE_SWITCH)
3215                                 continue;
3216
3217                         p_remote_sw = fabric_get_sw_by_guid(p_ftree,
3218                                                             osm_node_get_node_guid
3219                                                             (p_remote_node));
3220                         if (!p_remote_sw) {
3221                                 /* remote node is not a switch */
3222                                 continue;
3223                         }
3224
3225                         /* if needed, rank the remote switch and add it to the BFS list */
3226                         if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) {
3227                                 max_rank = p_remote_sw->rank;
3228                                 cl_list_insert_tail(p_ranking_bfs_list,
3229                                                     p_remote_sw);
3230                         }
3231                 }
3232         }
3233
3234         /* set FatTree maximal switch rank */
3235         p_ftree->max_switch_rank = max_rank;
3236
3237 }                               /* rank_switches_from_leafs() */
3238
3239 /***************************************************/
3240
3241 static int rank_leaf_switches(IN ftree_fabric_t * p_ftree,
3242                               IN ftree_hca_t * p_hca,
3243                               IN cl_list_t * p_ranking_bfs_list)
3244 {
3245         ftree_sw_t *p_sw;
3246         osm_node_t *p_osm_node = p_hca->p_osm_node;
3247         osm_node_t *p_remote_osm_node;
3248         osm_physp_t *p_osm_port;
3249         static uint8_t i = 0;
3250         int res = 0;
3251
3252         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3253
3254         for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
3255                 p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
3256                 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3257                         continue;
3258
3259                 p_remote_osm_node =
3260                     osm_node_get_remote_node(p_osm_node, i, NULL);
3261                 if (!p_remote_osm_node)
3262                         continue;
3263
3264                 switch (osm_node_get_type(p_remote_osm_node)) {
3265                 case IB_NODE_TYPE_CA:
3266                         /* HCA connected directly to another HCA - not FatTree */
3267                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3268                                 "ERR AB0F: "
3269                                 "CA conected directly to another CA: " "0x%016"
3270                                 PRIx64 " <---> 0x%016" PRIx64 "\n",
3271                                 hca_get_guid_ho(p_hca),
3272                                 cl_ntoh64(osm_node_get_node_guid
3273                                           (p_remote_osm_node)));
3274                         res = -1;
3275                         goto Exit;
3276
3277                 case IB_NODE_TYPE_ROUTER:
3278                         /* leaving this port - proceeding to the next one */
3279                         continue;
3280
3281                 case IB_NODE_TYPE_SWITCH:
3282                         /* continue with this port */
3283                         break;
3284
3285                 default:
3286                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3287                                 "ERR AB10: Node GUID 0x%016" PRIx64
3288                                 " - Unknown node type: %s\n",
3289                                 cl_ntoh64(osm_node_get_node_guid
3290                                           (p_remote_osm_node)),
3291                                 ib_get_node_type_str(osm_node_get_type
3292                                                      (p_remote_osm_node)));
3293                         res = -1;
3294                         goto Exit;
3295                 }
3296
3297                 /* remote node is switch */
3298
3299                 p_sw = fabric_get_sw_by_guid(p_ftree,
3300                                              osm_node_get_node_guid
3301                                              (p_osm_port->p_remote_physp->
3302                                               p_node));
3303                 CL_ASSERT(p_sw);
3304
3305                 /* if needed, rank the remote switch and add it to the BFS list */
3306
3307                 if (!sw_update_rank(p_sw, 0))
3308                         continue;
3309                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3310                         "Marking rank of switch that is directly connected to CA:\n"
3311                         "                                            - CA guid    : 0x%016"
3312                         PRIx64 "\n"
3313                         "                                            - Switch guid: 0x%016"
3314                         PRIx64 "\n"
3315                         "                                            - Switch LID : %u\n",
3316                         hca_get_guid_ho(p_hca),
3317                         sw_get_guid_ho(p_sw), p_sw->lid);
3318                 cl_list_insert_tail(p_ranking_bfs_list, p_sw);
3319         }
3320
3321 Exit:
3322         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3323         return res;
3324 }                               /* rank_leaf_switches() */
3325
3326 /***************************************************/
3327
3328 static void sw_reverse_rank(IN cl_map_item_t * const p_map_item,
3329                             IN void *context)
3330 {
3331         ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
3332         ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
3333         if (p_sw->rank != 0xFFFFFFFF)
3334                 p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
3335 }
3336
3337 /***************************************************
3338  ***************************************************/
3339
3340 static int
3341 fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
3342 {
3343         ftree_sw_t *p_remote_sw;
3344         osm_node_t *p_node = p_hca->p_osm_node;
3345         osm_node_t *p_remote_node;
3346         uint8_t remote_node_type;
3347         ib_net64_t remote_node_guid;
3348         osm_physp_t *p_remote_osm_port;
3349         uint8_t i;
3350         uint8_t remote_port_num;
3351         boolean_t is_cn;
3352         boolean_t is_in_cn_file;
3353         boolean_t is_io;
3354         boolean_t is_cns_file_provided = fabric_cns_provided(p_ftree);
3355         boolean_t is_ios_file_provided = fabric_ios_provided(p_ftree);
3356         int res = 0;
3357
3358         for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
3359                 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
3360                 is_io = FALSE;
3361                 is_cn = TRUE;
3362                 is_in_cn_file = FALSE;
3363
3364                 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3365                         continue;
3366
3367                 if (p_hca->disconnected_ports[i])
3368                         continue;
3369
3370                 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3371                 p_remote_node =
3372                     osm_node_get_remote_node(p_node, i, &remote_port_num);
3373
3374                 if (!p_remote_osm_port || !p_remote_node)
3375                         continue;
3376
3377                 remote_node_type = osm_node_get_type(p_remote_node);
3378                 remote_node_guid = osm_node_get_node_guid(p_remote_node);
3379
3380                 switch (remote_node_type) {
3381                 case IB_NODE_TYPE_ROUTER:
3382                         /* leaving this port - proceeding to the next one */
3383                         continue;
3384
3385                 case IB_NODE_TYPE_CA:
3386                         /* HCA connected directly to another HCA - not FatTree */
3387                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3388                                 "ERR AB11: "
3389                                 "CA conected directly to another CA: " "0x%016"
3390                                 PRIx64 " <---> 0x%016" PRIx64 "\n",
3391                                 cl_ntoh64(osm_node_get_node_guid(p_node)),
3392                                 cl_ntoh64(remote_node_guid));
3393                         res = -1;
3394                         goto Exit;
3395
3396                 case IB_NODE_TYPE_SWITCH:
3397                         /* continue with this port */
3398                         break;
3399
3400                 default:
3401                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3402                                 "ERR AB12: Node GUID 0x%016" PRIx64
3403                                 " - Unknown node type: %s\n",
3404                                 cl_ntoh64(remote_node_guid),
3405                                 ib_get_node_type_str(remote_node_type));
3406                         res = -1;
3407                         goto Exit;
3408                 }
3409
3410                 /* remote node is switch */
3411
3412                 p_remote_sw = fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3413                 CL_ASSERT(p_remote_sw);
3414
3415                 /* If CN file is not supplied, then all the CAs considered as Compute Nodes.
3416                    Otherwise all the CAs are not CNs, and only guids that are present in the
3417                    CN file will be marked as compute nodes. */
3418                 if (is_cns_file_provided == TRUE) {
3419                         name_map_item_t *p_elem = (name_map_item_t *)
3420                         cl_qmap_get(&p_ftree->cn_guid_tbl,
3421                                     cl_ntoh64(osm_physp_get_port_guid
3422                                              (p_osm_port)));
3423                         if (p_elem == (name_map_item_t *)
3424                                 cl_qmap_end(&p_ftree->cn_guid_tbl))
3425                                 is_cn = FALSE;
3426                         else
3427                                 is_in_cn_file = TRUE;
3428                 }
3429                 if (is_in_cn_file == FALSE && is_ios_file_provided == TRUE) {
3430                         name_map_item_t *p_elem = (name_map_item_t *)
3431                         cl_qmap_get(&p_ftree->io_guid_tbl,
3432                                     cl_ntoh64(osm_physp_get_port_guid
3433                                              (p_osm_port)));
3434                         if (p_elem != (name_map_item_t *)
3435                                 cl_qmap_end(&p_ftree->io_guid_tbl)) {
3436                                 is_io = TRUE;
3437                                 is_cn = FALSE;
3438                         }
3439                 }
3440
3441                 if (is_cn) {
3442                         p_ftree->cn_num++;
3443                         p_hca->cn_num++;
3444                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3445                                 "Marking CN port GUID 0x%016" PRIx64 "\n",
3446                                 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3447                 } else if (is_io) {
3448                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3449                                 "Marking I/O port GUID 0x%016" PRIx64 "\n",
3450                                 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3451                 } else {
3452                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3453                                 "Marking non-CN port GUID 0x%016" PRIx64 "\n",
3454                                 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3455                 }
3456                 p_ftree->ca_ports++;
3457
3458                 hca_add_port(p_ftree,
3459                              p_hca,     /* local ftree_hca object */
3460                              i, /* local port number */
3461                              remote_port_num,   /* remote port number */
3462                              cl_ntoh16(osm_node_get_base_lid(p_node, i)),       /* local lid */
3463                              cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0)),        /* remote lid */
3464                              osm_physp_get_port_guid(p_osm_port),       /* local port guid */
3465                              osm_physp_get_port_guid(p_remote_osm_port),        /* remote port guid */
3466                              remote_node_guid,  /* remote node guid */
3467                              remote_node_type,  /* remote node type */
3468                              (void *)p_remote_sw,       /* remote ftree_hca/sw object */
3469                              is_cn, is_io);     /* whether this port is compute node */
3470         }
3471
3472 Exit:
3473         return res;
3474 }                               /* fabric_construct_hca_ports() */
3475
3476 /***************************************************
3477  ***************************************************/
3478
3479 static int fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
3480                                      IN ftree_sw_t * p_sw)
3481 {
3482         ftree_hca_t *p_remote_hca;
3483         ftree_sw_t *p_remote_sw;
3484         osm_node_t *p_node = p_sw->p_osm_sw->p_node;
3485         osm_node_t *p_remote_node;
3486         uint16_t remote_lid;
3487         uint8_t remote_node_type;
3488         ib_net64_t remote_node_guid;
3489         osm_physp_t *p_remote_osm_port;
3490         ftree_direction_t direction;
3491         void *p_remote_hca_or_sw;
3492         uint8_t i;
3493         uint8_t remote_port_num;
3494         int res = 0;
3495
3496         CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
3497
3498         for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
3499                 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
3500                 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3501                         continue;
3502
3503                 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3504                 if (!p_remote_osm_port)
3505                         continue;
3506
3507                 p_remote_node =
3508                     osm_node_get_remote_node(p_node, i, &remote_port_num);
3509                 if (!p_remote_node)
3510                         continue;
3511
3512                 /* ignore any loopback connection on switch */
3513                 if (p_node == p_remote_node) {
3514                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3515                                 "Ignoring loopback on switch GUID 0x%016" PRIx64
3516                                 ", LID %u, rank %u\n",
3517                                 sw_get_guid_ho(p_sw),
3518                                 p_sw->lid, p_sw->rank);
3519                         continue;
3520                 }
3521
3522                 remote_node_type = osm_node_get_type(p_remote_node);
3523                 remote_node_guid = osm_node_get_node_guid(p_remote_node);
3524
3525                 switch (remote_node_type) {
3526                 case IB_NODE_TYPE_ROUTER:
3527                         /* leaving this port - proceeding to the next one */
3528                         continue;
3529
3530                 case IB_NODE_TYPE_CA:
3531                         /* switch connected to hca */
3532
3533                         p_remote_hca =
3534                             fabric_get_hca_by_guid(p_ftree, remote_node_guid);
3535                         CL_ASSERT(p_remote_hca);
3536
3537                         p_remote_hca_or_sw = (void *)p_remote_hca;
3538                         direction = FTREE_DIRECTION_DOWN;
3539
3540                         remote_lid =
3541                             cl_ntoh16(osm_physp_get_base_lid(p_remote_osm_port));
3542                         break;
3543
3544                 case IB_NODE_TYPE_SWITCH:
3545                         /* switch connected to another switch */
3546
3547                         p_remote_sw =
3548                             fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3549                         CL_ASSERT(p_remote_sw);
3550
3551                         p_remote_hca_or_sw = (void *)p_remote_sw;
3552
3553                         if (p_sw->rank > p_remote_sw->rank) {
3554                                 direction = FTREE_DIRECTION_UP;
3555                         } else if (p_sw->rank == p_remote_sw->rank) {
3556                                 direction = FTREE_DIRECTION_SAME;
3557                         } else
3558                                 direction = FTREE_DIRECTION_DOWN;
3559
3560                         /* switch LID is only in port 0 port_info structure */
3561                         remote_lid =
3562                             cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0));
3563
3564                         break;
3565
3566                 default:
3567                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3568                                 "ERR AB13: Node GUID 0x%016" PRIx64
3569                                 " - Unknown node type: %s\n",
3570                                 cl_ntoh64(remote_node_guid),
3571                                 ib_get_node_type_str(remote_node_type));
3572                         res = -1;
3573                         goto Exit;
3574                 }
3575                 sw_add_port(p_sw,       /* local ftree_sw object */
3576                             i,  /* local port number */
3577                             remote_port_num,    /* remote port number */
3578                             p_sw->lid,  /* local lid */
3579                             remote_lid, /* remote lid */
3580                             osm_physp_get_port_guid(p_osm_port),        /* local port guid */
3581                             osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */
3582                             remote_node_guid,   /* remote node guid */
3583                             remote_node_type,   /* remote node type */
3584                             p_remote_hca_or_sw, /* remote ftree_hca/sw object */
3585                             direction); /* port direction (up or down) */
3586
3587                 /* Track the max lid (in host order) that exists in the fabric */
3588                 if (remote_lid > p_ftree->lft_max_lid)
3589                         p_ftree->lft_max_lid = remote_lid;
3590         }
3591
3592 Exit:
3593         return res;
3594 }                               /* fabric_construct_sw_ports() */
3595
3596 /***************************************************
3597  ***************************************************/
/* Context handed to rank_root_sw_by_guid() through parse_node_map(). */
struct rank_root_cxt {
        ftree_fabric_t *fabric;         /* fabric in which root GUIDs are looked up */
        cl_list_t *list;                /* list that found root switches are appended to */
};
3602 /***************************************************
3603  ***************************************************/
3604 static int rank_root_sw_by_guid(void *cxt, uint64_t guid, char *p)
3605 {
3606         struct rank_root_cxt *c = cxt;
3607         ftree_sw_t *sw;
3608
3609         sw = fabric_get_sw_by_guid(c->fabric, cl_hton64(guid));
3610         if (!sw) {
3611                 /* the specified root guid wasn't found in the fabric */
3612                 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3613                         "Root switch GUID 0x%" PRIx64 " not found\n", guid);
3614                 return 0;
3615         }
3616
3617         OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_DEBUG,
3618                 "Ranking root switch with GUID 0x%" PRIx64 "\n", guid);
3619         sw->rank = 0;
3620         cl_list_insert_tail(c->list, sw);
3621
3622         return 0;
3623 }
3624 /***************************************************
3625  ***************************************************/
3626 static boolean_t fabric_load_roots(IN ftree_fabric_t * p_ftree,
3627                                    IN cl_list_t* p_ranking_bfs_list)
3628 {
3629         struct rank_root_cxt context;
3630         unsigned num_roots;
3631
3632         if (p_ranking_bfs_list) {
3633
3634                 /* Rank all the roots and add them to list */
3635                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3636                         "Fetching root nodes from file %s\n",
3637                         p_ftree->p_osm->subn.opt.root_guid_file);
3638
3639                 context.fabric = p_ftree;
3640                 context.list = p_ranking_bfs_list;
3641                 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3642                                    rank_root_sw_by_guid, &context)) {
3643                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB2A: "
3644                                 "cannot parse root guids file \'%s\'\n",
3645                                 p_ftree->p_osm->subn.opt.root_guid_file);
3646                         return FALSE;
3647                 }
3648
3649                 num_roots = cl_list_count(p_ranking_bfs_list);
3650                 if (!num_roots) {
3651                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3652                                 "No valid roots supplied\n");
3653                         return FALSE;
3654                 }
3655
3656                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3657                         "Ranked %u valid root switches\n", num_roots);
3658         }
3659         return TRUE;
3660 }
3661 /***************************************************
3662  ***************************************************/
3663 static int fabric_rank_from_roots(IN ftree_fabric_t * p_ftree,
3664                                   IN cl_list_t* p_ranking_bfs_list)
3665 {
3666         osm_node_t *p_osm_node;
3667         osm_node_t *p_remote_osm_node;
3668         osm_physp_t *p_osm_physp;
3669         ftree_sw_t *p_sw;
3670         ftree_sw_t *p_remote_sw;
3671         int res = 0;
3672         unsigned max_rank = 0;
3673         unsigned i;
3674
3675         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3676
3677         if (!p_ranking_bfs_list) {
3678                 res = -1;
3679                 goto Exit;
3680         }
3681         while (!cl_is_list_empty(p_ranking_bfs_list)) {
3682                 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
3683                 p_osm_node = p_sw->p_osm_sw->p_node;
3684
3685                 /* note: skipping port 0 on switches */
3686                 for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) {
3687                         p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i);
3688                         if (!p_osm_physp || !osm_link_is_healthy(p_osm_physp))
3689                                 continue;
3690
3691                         p_remote_osm_node =
3692                             osm_node_get_remote_node(p_osm_node, i, NULL);
3693                         if (!p_remote_osm_node)
3694                                 continue;
3695
3696                         if (osm_node_get_type(p_remote_osm_node) !=
3697                             IB_NODE_TYPE_SWITCH)
3698                                 continue;
3699
3700                         p_remote_sw = fabric_get_sw_by_guid(p_ftree,
3701                                                             osm_node_get_node_guid
3702                                                             (p_remote_osm_node));
3703                         CL_ASSERT(p_remote_sw);
3704
3705                         /* if needed, rank the remote switch and add it to the BFS list */
3706                         if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) {
3707                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3708                                         "Ranking switch 0x%" PRIx64
3709                                         " with rank %u\n",
3710                                         sw_get_guid_ho(p_remote_sw),
3711                                         p_remote_sw->rank);
3712                                 max_rank = p_remote_sw->rank;
3713                                 cl_list_insert_tail(p_ranking_bfs_list,
3714                                                     p_remote_sw);
3715                         }
3716                 }
3717                 /* done with ports of this switch - go to the next switch in the list */
3718         }
3719
3720         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3721                 "Subnet ranking completed. Max Node Rank = %u\n", max_rank);
3722
3723         /* set FatTree maximal switch rank */
3724         p_ftree->max_switch_rank = max_rank;
3725
3726 Exit:
3727         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3728         return res;
3729 }                               /* fabric_rank_from_roots() */
3730
3731 /***************************************************
3732  ***************************************************/
3733
3734 static int fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
3735 {
3736         ftree_hca_t *p_hca;
3737         ftree_hca_t *p_next_hca;
3738         cl_list_t ranking_bfs_list;
3739         int res = 0;
3740
3741         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3742
3743         cl_list_init(&ranking_bfs_list, 10);
3744
3745         /* Mark REVERSED rank of all the switches in the subnet.
3746            Start from switches that are connected to hca's, and
3747            scan all the switches in the subnet. */
3748         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3749         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3750                 p_hca = p_next_hca;
3751                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3752                 if (rank_leaf_switches(p_ftree, p_hca, &ranking_bfs_list) != 0) {
3753                         res = -1;
3754                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3755                                 "ERR AB14: "
3756                                 "Subnet ranking failed - subnet is not FatTree");
3757                         goto Exit;
3758                 }
3759         }
3760
3761         /* Now rank rest of the switches in the fabric, while the
3762            list already contains all the ranked leaf switches */
3763         rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
3764
3765         /* fix ranking of the switches by reversing the ranking direction */
3766         cl_qmap_apply_func(&p_ftree->sw_tbl, sw_reverse_rank, (void *)p_ftree);
3767
3768 Exit:
3769         cl_list_destroy(&ranking_bfs_list);
3770         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3771         return res;
3772 }                               /* fabric_rank_from_hcas() */
3773
3774 /***************************************************
3775  * After ranking from HCA's we want to re-rank using
3776  * the roots
3777  ***************************************************/
3778 static int fabric_rerank_using_root(IN ftree_fabric_t * p_ftree,
3779                                     IN cl_list_t* p_ranking_bfs_list)
3780 {
3781         ftree_sw_t *p_sw = NULL;
3782         ftree_sw_t *p_next_sw;
3783         int res;
3784
3785         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3786
3787         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3788         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3789                 p_sw = p_next_sw;
3790                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3791                 if (p_sw->rank == 0)
3792                         cl_list_insert_tail(p_ranking_bfs_list, p_sw);
3793                 else
3794                         p_sw->rank = 0xFFFFFFFF;
3795         }
3796         res = fabric_rank_from_roots(p_ftree, p_ranking_bfs_list);
3797         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3798         return res;
3799 }
3800 /***************************************************
3801  ***************************************************/
3802 static int fabric_rank(IN ftree_fabric_t * p_ftree)
3803 {
3804         int res = -1;
3805         cl_list_t ranking_bfs_list;
3806
3807         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3808         cl_list_init(&ranking_bfs_list, 10);
3809
3810         if (fabric_roots_provided(p_ftree) &&
3811             fabric_load_roots(p_ftree, &ranking_bfs_list))
3812                 res = fabric_rank_from_roots(p_ftree, &ranking_bfs_list);
3813         else {
3814                 res = fabric_rank_from_hcas(p_ftree);
3815                 if (!res)
3816                         res = fabric_rerank_using_root(p_ftree, &ranking_bfs_list);
3817         }
3818
3819         if (res)
3820                 goto Exit;
3821
3822         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3823                 "FatTree max switch rank is %u\n", p_ftree->max_switch_rank);
3824
3825 Exit:
3826         cl_list_destroy(&ranking_bfs_list);
3827         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3828         return res;
3829 }                               /* fabric_rank() */
3830
3831 /***************************************************
3832  ***************************************************/
3833
3834 static void fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
3835 {
3836         unsigned i;
3837         ftree_sw_t *p_sw;
3838         ftree_hca_t *p_hca = NULL;
3839         ftree_hca_t *p_next_hca;
3840
3841         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3842
3843         if (!fabric_roots_provided(p_ftree)) {
3844                 /* If root file is not provided, the fabric has to be pure fat-tree
3845                    in terms of ranking. Thus, leaf switches rank is the max rank. */
3846                 p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
3847         } else {
3848                 /* Find the first CN and set the leaf_switch_rank to the rank
3849                    of the switch that is connected to this CN. Later we will
3850                    ensure that all the leaf switches have the same rank. */
3851                 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3852                 while (p_next_hca !=
3853                        (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3854                         p_hca = p_next_hca;
3855                         if (p_hca->cn_num)
3856                                 break;
3857                         p_next_hca =
3858                             (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3859                 }
3860                 /* we know that there are CNs in the fabric, so just to be sure... */
3861                 CL_ASSERT(p_next_hca !=
3862                           (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));
3863
3864                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3865                         "Selected CN port GUID 0x%" PRIx64 "\n",
3866                         hca_get_guid_ho(p_hca));
3867
3868                 for (i = 0; (i < p_hca->up_port_groups_num)
3869                      && (!p_hca->up_port_groups[i]->is_cn); i++)
3870                         ;
3871                 CL_ASSERT(i < p_hca->up_port_groups_num);
3872                 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
3873                           IB_NODE_TYPE_SWITCH);
3874
3875                 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
3876                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3877                         "Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
3878                         sw_get_guid_ho(p_sw), p_sw->rank);
3879                 p_ftree->leaf_switch_rank = p_sw->rank;
3880         }
3881
3882         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3883                 "FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
3884         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3885 }                               /* fabric_set_leaf_rank() */
3886
3887 /***************************************************
3888  ***************************************************/
3889
3890 static int fabric_populate_ports(IN ftree_fabric_t * p_ftree)
3891 {
3892         ftree_hca_t *p_hca;
3893         ftree_hca_t *p_next_hca;
3894         ftree_sw_t *p_sw;
3895         ftree_sw_t *p_next_sw;
3896         int res = 0;
3897
3898         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3899
3900         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3901         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3902                 p_hca = p_next_hca;
3903                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3904                 if (fabric_construct_hca_ports(p_ftree, p_hca) != 0) {
3905                         res = -1;
3906                         goto Exit;
3907                 }
3908         }
3909
3910         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3911         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3912                 p_sw = p_next_sw;
3913                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3914                 if (fabric_construct_sw_ports(p_ftree, p_sw) != 0) {
3915                         res = -1;
3916                         goto Exit;
3917                 }
3918         }
3919 Exit:
3920         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3921         return res;
3922 }                               /* fabric_populate_ports() */
3923
3924 /***************************************************
3925  ***************************************************/
3926 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3927 {
3928         cl_qmap_t *map = cxt;
3929         name_map_item_t *item;
3930         name_map_item_t *inserted_item;
3931
3932         item = malloc(sizeof(*item));
3933         if (!item)
3934                 return -1;
3935
3936         item->guid = guid;
3937         inserted_item = (name_map_item_t *) cl_qmap_insert(map, guid, &item->item);
3938         if (inserted_item != item)
3939                 free(item);
3940
3941         return 0;
3942 }
3943
3944 static int fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
3945 {
3946         int status = 0;
3947
3948         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3949
3950         if (fabric_cns_provided(p_ftree)) {
3951                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3952                         "Fetching compute nodes from file %s\n",
3953                         p_ftree->p_osm->subn.opt.cn_guid_file);
3954
3955                 if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
3956                                    add_guid_item_to_map,
3957                                    &p_ftree->cn_guid_tbl)) {
3958                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3959                                 "ERR AB23: " "Problem parsing CN guid file\n");
3960                         status = -1;
3961                         goto Exit;
3962                 }
3963
3964                 if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) {
3965                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3966                                 "ERR AB27: "
3967                                 "Compute node guids file has no valid guids\n");
3968                         status = -1;
3969                         goto Exit;
3970                 }
3971         }
3972
3973         if (fabric_ios_provided(p_ftree)) {
3974                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3975                         "Fetching I/O nodes from file %s\n",
3976                         p_ftree->p_osm->subn.opt.io_guid_file);
3977
3978                 if (parse_node_map(p_ftree->p_osm->subn.opt.io_guid_file,
3979                                    add_guid_item_to_map,
3980                                    &p_ftree->io_guid_tbl)) {
3981                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3982                                 "ERR AB28: Problem parsing I/O guid file\n");
3983                         status = -1;
3984                         goto Exit;
3985                 }
3986
3987                 if (!cl_qmap_count(&p_ftree->io_guid_tbl)) {
3988                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3989                                 "ERR AB29: "
3990                                 "I/O node guids file has no valid guids\n");
3991                         status = -1;
3992                         goto Exit;
3993                 }
3994         }
3995 Exit:
3996         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3997         return status;
3998 }                               /*fabric_read_guid_files() */
3999
4000 /***************************************************
4001  ***************************************************/
4002 /* Get a Sw and remove all depended HCA's, meaning all
4003  * HCA's which this is the only switch they are connected
4004  * to   */
4005 static int remove_depended_hca(IN ftree_fabric_t *p_ftree, IN ftree_sw_t *p_sw)
4006 {
4007         ftree_hca_t *p_hca;
4008         int counter = 0;
4009         int port_num;
4010         uint8_t remote_port_num;
4011         osm_physp_t* physp;
4012         osm_node_t* sw_node;
4013         uint64_t remote_hca_guid;
4014
4015         sw_node = p_sw->p_osm_sw->p_node;
4016         for (port_num = 0; port_num < sw_node->physp_tbl_size; port_num++) {
4017                 physp = osm_node_get_physp_ptr(sw_node, port_num);
4018                 if (physp && physp->p_remote_physp) {
4019                         if (osm_node_get_type(physp->p_remote_physp->p_node) == IB_NODE_TYPE_CA) {
4020                                 remote_hca_guid =
4021                                     osm_node_get_node_guid(physp->p_remote_physp->p_node);
4022                                 p_hca = fabric_get_hca_by_guid(p_ftree, remote_hca_guid);
4023                                 if (!p_hca)
4024                                         continue;
4025
4026                                 remote_port_num =
4027                                     osm_physp_get_port_num(physp->p_remote_physp);
4028                                 p_hca->disconnected_ports[remote_port_num] = 1;
4029                         }
4030                 }
4031         }
4032         return counter;
4033 }
4034 /***************************************************
4035  ***************************************************/
4036 static void fabric_remove_unranked_sw(IN ftree_fabric_t *p_ftree)
4037 {
4038         ftree_sw_t *p_sw = NULL;
4039         ftree_sw_t *p_next_sw;
4040         int removed_hca;
4041         int count = 0;
4042
4043         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
4044         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
4045                 p_sw = p_next_sw;
4046                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
4047                 if (!sw_ranked(p_sw)) {
4048                         cl_qmap_remove_item(&p_ftree->sw_tbl,&p_sw->map_item);
4049                         removed_hca = remove_depended_hca(p_ftree, p_sw);
4050                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4051                                 "Removing Unranked sw 0x%" PRIx64 " (with %d dependent hca's)\n",
4052                                 sw_get_guid_ho(p_sw),removed_hca);
4053                         sw_destroy(p_sw);
4054                         count++;
4055                 }
4056         }
4057         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
4058                 "Removed %d invalid switches\n", count);
4059 }
4060 /***************************************************
4061  ***************************************************/
4062 static int construct_fabric(IN void *context)
4063 {
4064         ftree_fabric_t *p_ftree = context;
4065         int status = 0;
4066
4067         OSM_LOG_ENTER(&p_ftree->p_osm->log);
4068
4069         fabric_clear(p_ftree);
4070
4071         if (p_ftree->p_osm->subn.opt.lmc > 0) {
4072                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4073                            "LMC > 0 is not supported by fat-tree routing.\n"
4074                            "Falling back to default routing\n");
4075                 status = -1;
4076                 goto Exit;
4077         }
4078
4079         if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
4080                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4081                            "Fabric has %u switches - topology is not fat-tree.\n"
4082                            "Falling back to default routing\n",
4083                            cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
4084                 status = -1;
4085                 goto Exit;
4086         }
4087
4088         if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
4089              cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
4090                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4091                            "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
4092                            "Falling back to default routing\n",
4093                            cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
4094                            cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
4095                 status = -1;
4096                 goto Exit;
4097         }
4098
4099         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
4100                 "                       |----------------------------------------|\n"
4101                 "                       |- Starting FatTree fabric construction -|\n"
4102                 "                       |----------------------------------------|\n\n");
4103
4104         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4105                 "Populating FatTree Switch and CA tables\n");
4106         if (fabric_populate_nodes(p_ftree) != 0) {
4107                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4108                            "Fabric topology is not fat-tree - "
4109                            "falling back to default routing\n");
4110                 status = -1;
4111                 goto Exit;
4112         }
4113
4114         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4115                 "Reading guid files provided by user\n");
4116         if (fabric_read_guid_files(p_ftree) != 0) {
4117                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4118                            "Failed reading guid files - "
4119                            "falling back to default routing\n");
4120                 status = -1;
4121                 goto Exit;
4122         }
4123
4124         if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
4125                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4126                            "Fabric has %u CAs - topology is not fat-tree.\n"
4127                            "Falling back to default routing\n",
4128                            cl_qmap_count(&p_ftree->hca_tbl));
4129                 status = -1;
4130                 goto Exit;
4131         }
4132
4133         /* Rank all the switches in the fabric.
4134            After that we will know only fabric max switch rank.
4135            We will be able to check leaf switches rank and the
4136            whole tree rank after filling ports and marking CNs. */
4137         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
4138         if (fabric_rank(p_ftree) != 0) {
4139                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4140                            "Failed ranking the tree\n");
4141                 status = -1;
4142                 goto Exit;
4143         }
4144         fabric_remove_unranked_sw(p_ftree);
4145
4146         if (p_ftree->max_switch_rank == 0 &&
4147             cl_qmap_count(&p_ftree->sw_tbl) > 1) {
4148                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
4149                         "ERR AB2B: Found more than one root on fabric with "
4150                         "maximum rank 0\n");
4151                 status = -1;
4152                 goto Exit;
4153         }
4154
4155         /* For each hca and switch, construct array of ports.
4156            This is done after the whole FatTree data structure is ready,
4157            because we want the ports to have pointers to ftree_{sw,hca}_t
4158            objects, and we need the switches to be already ranked because
4159            that's how the port direction is determined. */
4160         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4161                 "Populating CA & switch ports\n");
4162         if (fabric_populate_ports(p_ftree) != 0) {
4163                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4164                            "Fabric topology is not a fat-tree\n");
4165                 status = -1;
4166                 goto Exit;
4167         } else if (p_ftree->cn_num == 0) {
4168                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4169                            "Fabric has no valid compute nodes\n");
4170                 status = -1;
4171                 goto Exit;
4172         }
4173
4174         /* Now that the CA ports have been created and CNs were marked,
4175            we can complete the fabric ranking - set leaf switches rank. */
4176         fabric_set_leaf_rank(p_ftree);
4177
4178         if (fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
4179             fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
4180                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4181                            "Fabric rank is %u (should be between %u and %u)\n",
4182                            fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
4183                            FAT_TREE_MAX_RANK);
4184                 status = -1;
4185                 goto Exit;
4186         }
4187
4188         /* Mark all the switches in the fabric with rank equal to
4189            p_ftree->leaf_switch_rank and that are also connected to CNs.
4190            As a by-product, this function also runs basic topology
4191            validation - it checks that all the CNs are at the same rank. */
4192         if (fabric_mark_leaf_switches(p_ftree)) {
4193                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4194                            "Fabric topology is not a fat-tree\n");
4195                 status = -1;
4196                 goto Exit;
4197         }
4198
4199         /* Assign index to all the switches in the fabric.
4200            This function also sorts leaf switch array by the switch index,
4201            sorts all the port arrays of the indexed switches by remote
4202            switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */
4203         fabric_make_indexing(p_ftree);
4204
4205         /* Create leaf switch array sorted by index.
4206            This array contains switches with rank equal to p_ftree->leaf_switch_rank
4207            and that are also connected to CNs (REAL leafs), and it may contain
4208            switches at the same leaf rank w/o CNs, if this is the order of indexing.
4209            In any case, the first and the last switches in the array are REAL leafs. */
4210         if (fabric_create_leaf_switch_array(p_ftree)) {
4211                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4212                            "Fabric topology is not a fat-tree\n");
4213                 status = -1;
4214                 goto Exit;
4215         }
4216
4217         /* calculate and set ftree.max_cn_per_leaf field */
4218         fabric_set_max_cn_per_leaf(p_ftree);
4219
4220         /* print general info about fabric topology */
4221         fabric_dump_general_info(p_ftree);
4222
4223         /* dump full tree topology */
4224         if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
4225                 fabric_dump(p_ftree);
4226
4227         /* the fabric is required to be PURE fat-tree only if the root
4228            guid file hasn't been provided by user */
4229         if (!fabric_roots_provided(p_ftree) &&
4230             !fabric_validate_topology(p_ftree)) {
4231                 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4232                            "Fabric topology is not a fat-tree\n");
4233                 status = -1;
4234                 goto Exit;
4235         }
4236
4237         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4238                 "Max LID in switch LFTs: %u\n", p_ftree->lft_max_lid);
4239
4240         /* Build the full lid matrices needed for multicast routing */
4241         osm_ucast_mgr_build_lid_matrices(&p_ftree->p_osm->sm.ucast_mgr);
4242
4243 Exit:
4244         if (status != 0) {
4245                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4246                         "Clearing FatTree Fabric data structures\n");
4247                 fabric_clear(p_ftree);
4248         } else
4249                 p_ftree->fabric_built = TRUE;
4250
4251         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
4252                 "                       |--------------------------------------------------|\n"
4253                 "                       |- Done constructing FatTree fabric (status = %d) -|\n"
4254                 "                       |--------------------------------------------------|\n\n",
4255                 status);
4256
4257         OSM_LOG_EXIT(&p_ftree->p_osm->log);
4258         return status;
4259 }                               /* construct_fabric() */
4260
4261 /***************************************************
4262  ***************************************************/
4263
4264 static int do_routing(IN void *context)
4265 {
4266         ftree_fabric_t *p_ftree = context;
4267         int status = 0;
4268
4269         OSM_LOG_ENTER(&p_ftree->p_osm->log);
4270
4271         if (!p_ftree->fabric_built) {
4272                 status = -1;
4273                 goto Exit;
4274         }
4275
4276         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4277                 "Starting FatTree routing\n");
4278
4279         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4280                 "Filling switch forwarding tables for Compute Nodes\n");
4281         fabric_route_to_cns(p_ftree);
4282
4283         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4284                 "Filling switch forwarding tables for non-CN targets\n");
4285         fabric_route_to_non_cns(p_ftree);
4286
4287         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4288                 "Filling switch forwarding tables for switch-to-switch paths\n");
4289         fabric_route_to_switches(p_ftree);
4290
4291         if (p_ftree->p_osm->subn.opt.connect_roots) {
4292                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4293                         "Connecting switches that are unreachable within "
4294                         "Up/Down rules\n");
4295                 fabric_route_roots(p_ftree);
4296         }
4297
4298         /* for each switch, set its fwd table */
4299         cl_qmap_apply_func(&p_ftree->sw_tbl, set_sw_fwd_table, (void *)p_ftree);
4300
4301         /* write out hca ordering file */
4302         fabric_dump_hca_ordering(p_ftree);
4303
4304         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4305                 "FatTree routing is done\n");
4306
4307 Exit:
4308         OSM_LOG_EXIT(&p_ftree->p_osm->log);
4309         return status;
4310 }
4311
4312 /***************************************************
4313  ***************************************************/
4314
4315 static void delete(IN void *context)
4316 {
4317         if (!context)
4318                 return;
4319         fabric_destroy((ftree_fabric_t *) context);
4320 }
4321
4322 /***************************************************
4323  ***************************************************/
4324
4325 int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
4326 {
4327         ftree_fabric_t *p_ftree = fabric_create();
4328         if (!p_ftree)
4329                 return -1;
4330
4331         p_ftree->p_osm = p_osm;
4332         p_ftree->p_subn = p_osm->sm.ucast_mgr.p_subn;
4333
4334         r->context = (void *)p_ftree;
4335         r->build_lid_matrices = construct_fabric;
4336         r->ucast_build_fwd_tables = do_routing;
4337         r->destroy = delete;
4338
4339         return 0;
4340 }