]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - contrib/ofed/management/opensm/opensm/osm_ucast_ftree.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / contrib / ofed / management / opensm / opensm / osm_ucast_ftree.c
1 /*
2  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  *
34  */
35
36 /*
37  * Abstract:
38  *    Implementation of OpenSM FatTree routing
39  */
40
41 #if HAVE_CONFIG_H
42 #  include <config.h>
43 #endif
44
45 #include <stdlib.h>
46 #include <string.h>
47 #include <ctype.h>
48 #include <errno.h>
49 #include <iba/ib_types.h>
50 #include <complib/cl_qmap.h>
51 #include <complib/cl_debug.h>
52 #include <opensm/osm_opensm.h>
53 #include <opensm/osm_switch.h>
54
55 /*
56  * FatTree rank is bounded between 2 and 8:
57  *  - Tree of rank 1 has only trivial routing paths,
58  *    so no need to use FatTree routing.
59  *  - Why maximum rank is 8:
60  *    Each node (switch) is assigned a unique tuple.
61  *    Switches are stored in two cl_qmaps - one is
62  *    ordered by guid, and the other by a key that is
63  *    generated from tuple. Since cl_qmap supports only
64  *    a 64-bit key, the maximal tuple length is 8 bytes,
65  *    which means that maximal tree rank is 8.
66  * Note that the above also implies that each switch
67  * can have at max 255 up/down ports.
68  */
69
70 #define FAT_TREE_MIN_RANK 2
71 #define FAT_TREE_MAX_RANK 8
72
/*
 * Direction tag passed around together with port groups.
 * The numeric values are fixed: DOWN is -1, SAME is 0, UP is 1.
 */
typedef enum {
	FTREE_DIRECTION_DOWN = -1,
	FTREE_DIRECTION_SAME = 0,
	FTREE_DIRECTION_UP = 1
} ftree_direction_t;
78
79 /***************************************************
80  **
81  **  Forward references
82  **
83  ***************************************************/
84
85 struct ftree_sw_t_;
86 struct ftree_hca_t_;
87 struct ftree_port_t_;
88 struct ftree_port_group_t_;
89 struct ftree_fabric_t_;
90
91 /***************************************************
92  **
93  **  ftree_tuple_t definition
94  **
95  ***************************************************/
96
97 #define FTREE_TUPLE_BUFF_LEN 1024
98 #define FTREE_TUPLE_LEN 8
99
100 typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
101 typedef uint64_t ftree_tuple_key_t;
102
103 struct guid_list_item {
104         cl_list_item_t list;
105         uint64_t guid;
106 };
107
108 /***************************************************
109  **
110  **  ftree_sw_table_element_t definition
111  **
112  ***************************************************/
113
114 typedef struct {
115         cl_map_item_t map_item;
116         struct ftree_sw_t_ *p_sw;
117 } ftree_sw_tbl_element_t;
118
119 /***************************************************
120  **
121  **  ftree_port_t definition
122  **
123  ***************************************************/
124
125 typedef struct ftree_port_t_ {
126         cl_map_item_t map_item;
127         uint8_t port_num;       /* port number on the current node */
128         uint8_t remote_port_num;        /* port number on the remote node */
129         uint32_t counter_up;    /* number of allocated routs upwards */
130         uint32_t counter_down;  /* number of allocated routs downwards */
131 } ftree_port_t;
132
133 /***************************************************
134  **
135  **  ftree_port_group_t definition
136  **
137  ***************************************************/
138
/*
 * Pointer to the node object owning (or facing) a port group.
 * Which member is valid is decided by the accompanying node type field.
 */
union ftree_hca_or_sw_ {
	struct ftree_hca_t_ *p_hca;	/* valid when the node is a CA */
	struct ftree_sw_t_ *p_sw;	/* valid when the node is a switch */
};

typedef union ftree_hca_or_sw_ ftree_hca_or_sw;
143
144 typedef struct ftree_port_group_t_ {
145         cl_map_item_t map_item;
146         ib_net16_t base_lid;    /* base lid of the current node */
147         ib_net16_t remote_base_lid;     /* base lid of the remote node */
148         ib_net64_t port_guid;   /* port guid of this port */
149         ib_net64_t node_guid;   /* this node's guid */
150         uint8_t node_type;      /* this node's type */
151         ib_net64_t remote_port_guid;    /* port guid of the remote port */
152         ib_net64_t remote_node_guid;    /* node guid of the remote node */
153         uint8_t remote_node_type;       /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
154         ftree_hca_or_sw hca_or_sw;      /* pointer to this hca/switch */
155         ftree_hca_or_sw remote_hca_or_sw;       /* pointer to remote hca/switch */
156         cl_ptr_vector_t ports;  /* vector of ports to the same lid */
157         boolean_t is_cn;        /* whether this port is a compute node */
158         uint32_t counter_down;  /* number of allocated routs downwards */
159 } ftree_port_group_t;
160
161 /***************************************************
162  **
163  **  ftree_sw_t definition
164  **
165  ***************************************************/
166
167 typedef struct ftree_sw_t_ {
168         cl_map_item_t map_item;
169         osm_switch_t *p_osm_sw;
170         uint32_t rank;
171         ftree_tuple_t tuple;
172         ib_net16_t base_lid;
173         ftree_port_group_t **down_port_groups;
174         uint8_t down_port_groups_num;
175         ftree_port_group_t **up_port_groups;
176         uint8_t up_port_groups_num;
177         boolean_t is_leaf;
178         int down_port_groups_idx;
179 } ftree_sw_t;
180
181 /***************************************************
182  **
183  **  ftree_hca_t definition
184  **
185  ***************************************************/
186
187 typedef struct ftree_hca_t_ {
188         cl_map_item_t map_item;
189         osm_node_t *p_osm_node;
190         ftree_port_group_t **up_port_groups;
191         uint16_t up_port_groups_num;
192         unsigned cn_num;
193 } ftree_hca_t;
194
195 /***************************************************
196  **
197  **  ftree_fabric_t definition
198  **
199  ***************************************************/
200
201 typedef struct ftree_fabric_t_ {
202         osm_opensm_t *p_osm;
203         cl_qmap_t hca_tbl;
204         cl_qmap_t sw_tbl;
205         cl_qmap_t sw_by_tuple_tbl;
206         cl_qlist_t root_guid_list;
207         cl_qmap_t cn_guid_tbl;
208         unsigned cn_num;
209         uint8_t leaf_switch_rank;
210         uint8_t max_switch_rank;
211         ftree_sw_t **leaf_switches;
212         uint32_t leaf_switches_num;
213         uint16_t max_cn_per_leaf;
214         uint16_t lft_max_lid_ho;
215         boolean_t fabric_built;
216 } ftree_fabric_t;
217
218 /***************************************************
219  **
220  ** comparators
221  **
222  ***************************************************/
223
224 static int OSM_CDECL __osm_ftree_compare_switches_by_index(IN const void *p1,
225                                                            IN const void *p2)
226 {
227         ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1;
228         ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2;
229
230         uint16_t i;
231         for (i = 0; i < FTREE_TUPLE_LEN; i++) {
232                 if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
233                         return 1;
234                 if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
235                         return -1;
236         }
237         return 0;
238 }
239
240 /***************************************************/
241
242 static int OSM_CDECL
243 __osm_ftree_compare_port_groups_by_remote_switch_index(IN const void *p1,
244                                                        IN const void *p2)
245 {
246         ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
247         ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
248
249         return
250             __osm_ftree_compare_switches_by_index(&
251                                                   ((*pp_g1)->remote_hca_or_sw.
252                                                    p_sw),
253                                                   &((*pp_g2)->remote_hca_or_sw.
254                                                     p_sw));
255 }
256
257 /***************************************************
258  **
259  ** ftree_tuple_t functions
260  **
261  ***************************************************/
262
263 static void __osm_ftree_tuple_init(IN ftree_tuple_t tuple)
264 {
265         memset(tuple, 0xFF, FTREE_TUPLE_LEN);
266 }
267
268 /***************************************************/
269
270 static inline boolean_t __osm_ftree_tuple_assigned(IN ftree_tuple_t tuple)
271 {
272         return (tuple[0] != 0xFF);
273 }
274
275 /***************************************************/
276
277 #define FTREE_TUPLE_BUFFERS_NUM 6
278
279 static char *__osm_ftree_tuple_to_str(IN ftree_tuple_t tuple)
280 {
281         static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
282         static uint8_t ind = 0;
283         char *ret_buffer;
284         uint32_t i;
285
286         if (!__osm_ftree_tuple_assigned(tuple))
287                 return "INDEX.NOT.ASSIGNED";
288
289         buffer[ind][0] = '\0';
290
291         for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
292                 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
293                         return "INDEX.TOO.LONG";
294                 if (i != 0)
295                         strcat(buffer[ind], ".");
296                 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
297         }
298
299         ret_buffer = buffer[ind];
300         ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
301         return ret_buffer;
302 }                               /* __osm_ftree_tuple_to_str() */
303
304 /***************************************************/
305
306 static inline ftree_tuple_key_t __osm_ftree_tuple_to_key(IN ftree_tuple_t tuple)
307 {
308         ftree_tuple_key_t key;
309         memcpy(&key, tuple, FTREE_TUPLE_LEN);
310         return key;
311 }
312
313 /***************************************************/
314
315 static inline void __osm_ftree_tuple_from_key(IN ftree_tuple_t tuple,
316                                               IN ftree_tuple_key_t key)
317 {
318         memcpy(tuple, &key, FTREE_TUPLE_LEN);
319 }
320
321 /***************************************************
322  **
323  ** ftree_sw_tbl_element_t functions
324  **
325  ***************************************************/
326
327 static ftree_sw_tbl_element_t *__osm_ftree_sw_tbl_element_create(IN ftree_sw_t *
328                                                                  p_sw)
329 {
330         ftree_sw_tbl_element_t *p_element =
331             (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
332         if (!p_element)
333                 return NULL;
334         memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
335
336         p_element->p_sw = p_sw;
337         return p_element;
338 }
339
340 /***************************************************/
341
342 static void __osm_ftree_sw_tbl_element_destroy(IN ftree_sw_tbl_element_t *
343                                                p_element)
344 {
345         if (!p_element)
346                 return;
347         free(p_element);
348 }
349
350 /***************************************************
351  **
352  ** ftree_port_t functions
353  **
354  ***************************************************/
355
356 static ftree_port_t *__osm_ftree_port_create(IN uint8_t port_num,
357                                              IN uint8_t remote_port_num)
358 {
359         ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
360         if (!p_port)
361                 return NULL;
362         memset(p_port, 0, sizeof(ftree_port_t));
363
364         p_port->port_num = port_num;
365         p_port->remote_port_num = remote_port_num;
366
367         return p_port;
368 }
369
370 /***************************************************/
371
372 static void __osm_ftree_port_destroy(IN ftree_port_t * p_port)
373 {
374         if (p_port)
375                 free(p_port);
376 }
377
378 /***************************************************
379  **
380  ** ftree_port_group_t functions
381  **
382  ***************************************************/
383
384 static ftree_port_group_t *
385 __osm_ftree_port_group_create(IN ib_net16_t base_lid,
386                               IN ib_net16_t remote_base_lid,
387                               IN ib_net64_t port_guid,
388                               IN ib_net64_t node_guid,
389                               IN uint8_t node_type,
390                               IN void *p_hca_or_sw,
391                               IN ib_net64_t remote_port_guid,
392                               IN ib_net64_t remote_node_guid,
393                               IN uint8_t remote_node_type,
394                               IN void *p_remote_hca_or_sw,
395                               IN boolean_t is_cn)
396 {
397         ftree_port_group_t *p_group =
398             (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
399         if (p_group == NULL)
400                 return NULL;
401         memset(p_group, 0, sizeof(ftree_port_group_t));
402
403         p_group->base_lid = base_lid;
404         p_group->remote_base_lid = remote_base_lid;
405         memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
406         memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
407         memcpy(&p_group->remote_port_guid, &remote_port_guid,
408                sizeof(ib_net64_t));
409         memcpy(&p_group->remote_node_guid, &remote_node_guid,
410                sizeof(ib_net64_t));
411
412         p_group->node_type = node_type;
413         switch (node_type) {
414         case IB_NODE_TYPE_CA:
415                 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
416                 break;
417         case IB_NODE_TYPE_SWITCH:
418                 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
419                 break;
420         default:
421                 /* we shouldn't get here - port is created only in hca or switch */
422                 CL_ASSERT(0);
423         }
424
425         p_group->remote_node_type = remote_node_type;
426         switch (remote_node_type) {
427         case IB_NODE_TYPE_CA:
428                 p_group->remote_hca_or_sw.p_hca =
429                     (ftree_hca_t *) p_remote_hca_or_sw;
430                 break;
431         case IB_NODE_TYPE_SWITCH:
432                 p_group->remote_hca_or_sw.p_sw =
433                     (ftree_sw_t *) p_remote_hca_or_sw;
434                 break;
435         default:
436                 /* we shouldn't get here - port is created only in hca or switch */
437                 CL_ASSERT(0);
438         }
439
440         cl_ptr_vector_init(&p_group->ports, 0,  /* min size */
441                            8);  /* grow size */
442         p_group->is_cn = is_cn;
443         return p_group;
444 }                               /* __osm_ftree_port_group_create() */
445
446 /***************************************************/
447
448 static void __osm_ftree_port_group_destroy(IN ftree_port_group_t * p_group)
449 {
450         uint32_t i;
451         uint32_t size;
452         ftree_port_t *p_port;
453
454         if (!p_group)
455                 return;
456
457         /* remove all the elements of p_group->ports vector */
458         size = cl_ptr_vector_get_size(&p_group->ports);
459         for (i = 0; i < size; i++) {
460                 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
461                 __osm_ftree_port_destroy(p_port);
462         }
463         cl_ptr_vector_destroy(&p_group->ports);
464         free(p_group);
465 }                               /* __osm_ftree_port_group_destroy() */
466
467 /***************************************************/
468
469 static void
470 __osm_ftree_port_group_dump(IN ftree_fabric_t * p_ftree,
471                             IN ftree_port_group_t * p_group,
472                             IN ftree_direction_t direction)
473 {
474         ftree_port_t *p_port;
475         uint32_t size;
476         uint32_t i;
477         char buff[10 * 1024];
478
479         if (!p_group)
480                 return;
481
482         if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
483                 return;
484
485         size = cl_ptr_vector_get_size(&p_group->ports);
486         buff[0] = '\0';
487
488         for (i = 0; i < size; i++) {
489                 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
490                 CL_ASSERT(p_port);
491
492                 if (i != 0)
493                         strcat(buff, ", ");
494                 sprintf(buff + strlen(buff), "%u", p_port->port_num);
495         }
496
497         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
498                 "    Port Group of size %u, port(s): %s, direction: %s\n"
499                 "                  Local <--> Remote GUID (LID):"
500                 "0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
501                 size,
502                 buff,
503                 (direction == FTREE_DIRECTION_DOWN) ? "DOWN" : "UP",
504                 cl_ntoh64(p_group->port_guid),
505                 cl_ntoh16(p_group->base_lid),
506                 cl_ntoh64(p_group->remote_port_guid),
507                 cl_ntoh16(p_group->remote_base_lid));
508
509 }                               /* __osm_ftree_port_group_dump() */
510
511 /***************************************************/
512
513 static void
514 __osm_ftree_port_group_add_port(IN ftree_port_group_t * p_group,
515                                 IN uint8_t port_num, IN uint8_t remote_port_num)
516 {
517         uint16_t i;
518         ftree_port_t *p_port;
519
520         for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) {
521                 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
522                 if (p_port->port_num == port_num)
523                         return;
524         }
525
526         p_port = __osm_ftree_port_create(port_num, remote_port_num);
527         cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
528 }
529
530 /***************************************************
531  **
532  ** ftree_sw_t functions
533  **
534  ***************************************************/
535
536 static ftree_sw_t *__osm_ftree_sw_create(IN ftree_fabric_t * p_ftree,
537                                          IN osm_switch_t * p_osm_sw)
538 {
539         ftree_sw_t *p_sw;
540         uint8_t ports_num;
541
542         /* make sure that the switch has ports */
543         if (p_osm_sw->num_ports == 1)
544                 return NULL;
545
546         p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
547         if (p_sw == NULL)
548                 return NULL;
549         memset(p_sw, 0, sizeof(ftree_sw_t));
550
551         p_sw->p_osm_sw = p_osm_sw;
552         p_sw->rank = 0xFFFFFFFF;
553         __osm_ftree_tuple_init(p_sw->tuple);
554
555         p_sw->base_lid = osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0);
556
557         ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
558         p_sw->down_port_groups =
559             (ftree_port_group_t **) malloc(ports_num *
560                                            sizeof(ftree_port_group_t *));
561         p_sw->up_port_groups =
562             (ftree_port_group_t **) malloc(ports_num *
563                                            sizeof(ftree_port_group_t *));
564         if (!p_sw->down_port_groups || !p_sw->up_port_groups)
565                 return NULL;
566         p_sw->down_port_groups_num = 0;
567         p_sw->up_port_groups_num = 0;
568
569         /* initialize lft buffer */
570         memset(p_osm_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
571
572         p_sw->down_port_groups_idx = -1;
573
574         return p_sw;
575 }                               /* __osm_ftree_sw_create() */
576
577 /***************************************************/
578
579 static void __osm_ftree_sw_destroy(IN ftree_fabric_t * p_ftree,
580                                    IN ftree_sw_t * p_sw)
581 {
582         uint8_t i;
583
584         if (!p_sw)
585                 return;
586
587         for (i = 0; i < p_sw->down_port_groups_num; i++)
588                 __osm_ftree_port_group_destroy(p_sw->down_port_groups[i]);
589         for (i = 0; i < p_sw->up_port_groups_num; i++)
590                 __osm_ftree_port_group_destroy(p_sw->up_port_groups[i]);
591         if (p_sw->down_port_groups)
592                 free(p_sw->down_port_groups);
593         if (p_sw->up_port_groups)
594                 free(p_sw->up_port_groups);
595
596         free(p_sw);
597 }                               /* __osm_ftree_sw_destroy() */
598
599 /***************************************************/
600
601 static uint64_t __osm_ftree_sw_get_guid_no(IN ftree_sw_t * p_sw)
602 {
603         if (!p_sw)
604                 return 0;
605         return osm_node_get_node_guid(p_sw->p_osm_sw->p_node);
606 }
607
608 /***************************************************/
609
610 static uint64_t __osm_ftree_sw_get_guid_ho(IN ftree_sw_t * p_sw)
611 {
612         return cl_ntoh64(__osm_ftree_sw_get_guid_no(p_sw));
613 }
614
615 /***************************************************/
616
617 static void __osm_ftree_sw_dump(IN ftree_fabric_t * p_ftree,
618                                 IN ftree_sw_t * p_sw)
619 {
620         uint32_t i;
621
622         if (!p_sw)
623                 return;
624
625         if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
626                 return;
627
628         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
629                 "Switch index: %s, GUID: 0x%016" PRIx64
630                 ", Ports: %u DOWN, %u UP\n",
631                 __osm_ftree_tuple_to_str(p_sw->tuple),
632                 __osm_ftree_sw_get_guid_ho(p_sw), p_sw->down_port_groups_num,
633                 p_sw->up_port_groups_num);
634
635         for (i = 0; i < p_sw->down_port_groups_num; i++)
636                 __osm_ftree_port_group_dump(p_ftree,
637                                             p_sw->down_port_groups[i],
638                                             FTREE_DIRECTION_DOWN);
639         for (i = 0; i < p_sw->up_port_groups_num; i++)
640                 __osm_ftree_port_group_dump(p_ftree, p_sw->up_port_groups[i],
641                                             FTREE_DIRECTION_UP);
642
643 }                               /* __osm_ftree_sw_dump() */
644
645 /***************************************************/
646
647 static boolean_t __osm_ftree_sw_ranked(IN ftree_sw_t * p_sw)
648 {
649         return (p_sw->rank != 0xFFFFFFFF);
650 }
651
652 /***************************************************/
653
654 static ftree_port_group_t *
655 __osm_ftree_sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
656                                             IN ib_net16_t remote_base_lid,
657                                             IN ftree_direction_t direction)
658 {
659         uint32_t i;
660         uint32_t size;
661         ftree_port_group_t **port_groups;
662
663         if (direction == FTREE_DIRECTION_UP) {
664                 port_groups = p_sw->up_port_groups;
665                 size = p_sw->up_port_groups_num;
666         } else {
667                 port_groups = p_sw->down_port_groups;
668                 size = p_sw->down_port_groups_num;
669         }
670
671         for (i = 0; i < size; i++)
672                 if (remote_base_lid == port_groups[i]->remote_base_lid)
673                         return port_groups[i];
674
675         return NULL;
676 }                               /* __osm_ftree_sw_get_port_group_by_remote_lid() */
677
678 /***************************************************/
679
680 static void
681 __osm_ftree_sw_add_port(IN ftree_sw_t * p_sw,
682                         IN uint8_t port_num,
683                         IN uint8_t remote_port_num,
684                         IN ib_net16_t base_lid,
685                         IN ib_net16_t remote_base_lid,
686                         IN ib_net64_t port_guid,
687                         IN ib_net64_t remote_port_guid,
688                         IN ib_net64_t remote_node_guid,
689                         IN uint8_t remote_node_type,
690                         IN void *p_remote_hca_or_sw,
691                         IN ftree_direction_t direction)
692 {
693         ftree_port_group_t *p_group =
694             __osm_ftree_sw_get_port_group_by_remote_lid(p_sw, remote_base_lid,
695                                                         direction);
696
697         if (!p_group) {
698                 p_group = __osm_ftree_port_group_create(base_lid,
699                                                         remote_base_lid,
700                                                         port_guid,
701                                                         __osm_ftree_sw_get_guid_no
702                                                         (p_sw),
703                                                         IB_NODE_TYPE_SWITCH,
704                                                         p_sw, remote_port_guid,
705                                                         remote_node_guid,
706                                                         remote_node_type,
707                                                         p_remote_hca_or_sw,
708                                                         FALSE);
709                 CL_ASSERT(p_group);
710
711                 if (direction == FTREE_DIRECTION_UP)
712                         p_sw->up_port_groups[p_sw->up_port_groups_num++] =
713                             p_group;
714                 else
715                         p_sw->down_port_groups[p_sw->down_port_groups_num++] =
716                             p_group;
717         }
718         __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
719
720 }                               /* __osm_ftree_sw_add_port() */
721
722 /***************************************************/
723
724 static inline cl_status_t
725 __osm_ftree_sw_set_hops(IN ftree_sw_t * p_sw,
726                         IN uint16_t lid_ho, IN uint8_t port_num,
727                         IN uint8_t hops)
728 {
729         /* set local min hop table(LID) */
730         return osm_switch_set_hops(p_sw->p_osm_sw, lid_ho, port_num, hops);
731 }
732
733 /***************************************************
734  **
735  ** ftree_hca_t functions
736  **
737  ***************************************************/
738
739 static ftree_hca_t *__osm_ftree_hca_create(IN osm_node_t * p_osm_node)
740 {
741         ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
742         if (p_hca == NULL)
743                 return NULL;
744         memset(p_hca, 0, sizeof(ftree_hca_t));
745
746         p_hca->p_osm_node = p_osm_node;
747         p_hca->up_port_groups = (ftree_port_group_t **)
748             malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
749                    sizeof(ftree_port_group_t *));
750         if (!p_hca->up_port_groups)
751                 return NULL;
752         p_hca->up_port_groups_num = 0;
753         return p_hca;
754 }
755
756 /***************************************************/
757
758 static void __osm_ftree_hca_destroy(IN ftree_hca_t * p_hca)
759 {
760         uint32_t i;
761
762         if (!p_hca)
763                 return;
764
765         for (i = 0; i < p_hca->up_port_groups_num; i++)
766                 __osm_ftree_port_group_destroy(p_hca->up_port_groups[i]);
767
768         if (p_hca->up_port_groups)
769                 free(p_hca->up_port_groups);
770
771         free(p_hca);
772 }
773
774 /***************************************************/
775
776 static uint64_t __osm_ftree_hca_get_guid_no(IN ftree_hca_t * p_hca)
777 {
778         if (!p_hca)
779                 return 0;
780         return osm_node_get_node_guid(p_hca->p_osm_node);
781 }
782
783 /***************************************************/
784
785 static uint64_t __osm_ftree_hca_get_guid_ho(IN ftree_hca_t * p_hca)
786 {
787         return cl_ntoh64(__osm_ftree_hca_get_guid_no(p_hca));
788 }
789
790 /***************************************************/
791
792 static void __osm_ftree_hca_dump(IN ftree_fabric_t * p_ftree,
793                                  IN ftree_hca_t * p_hca)
794 {
795         uint32_t i;
796
797         if (!p_hca)
798                 return;
799
800         if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
801                 return;
802
803         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
804                 "CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
805                 __osm_ftree_hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);
806
807         for (i = 0; i < p_hca->up_port_groups_num; i++)
808                 __osm_ftree_port_group_dump(p_ftree, p_hca->up_port_groups[i],
809                                             FTREE_DIRECTION_UP);
810 }
811
812 /***************************************************/
813
814 static ftree_port_group_t *
815 __osm_ftree_hca_get_port_group_by_remote_lid(IN ftree_hca_t * p_hca,
816                                              IN ib_net16_t remote_base_lid)
817 {
818         uint32_t i;
819         for (i = 0; i < p_hca->up_port_groups_num; i++)
820                 if (remote_base_lid ==
821                     p_hca->up_port_groups[i]->remote_base_lid)
822                         return p_hca->up_port_groups[i];
823
824         return NULL;
825 }
826
827 /***************************************************/
828
829 static void
830 __osm_ftree_hca_add_port(IN ftree_hca_t * p_hca,
831                          IN uint8_t port_num,
832                          IN uint8_t remote_port_num,
833                          IN ib_net16_t base_lid,
834                          IN ib_net16_t remote_base_lid,
835                          IN ib_net64_t port_guid,
836                          IN ib_net64_t remote_port_guid,
837                          IN ib_net64_t remote_node_guid,
838                          IN uint8_t remote_node_type,
839                          IN void *p_remote_hca_or_sw, IN boolean_t is_cn)
840 {
841         ftree_port_group_t *p_group;
842
843         /* this function is supposed to be called only for adding ports
844            in hca's that lead to switches */
845         CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
846
847         p_group =
848             __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
849                                                          remote_base_lid);
850
851         if (!p_group) {
852                 p_group = __osm_ftree_port_group_create(base_lid,
853                                                         remote_base_lid,
854                                                         port_guid,
855                                                         __osm_ftree_hca_get_guid_no
856                                                         (p_hca),
857                                                         IB_NODE_TYPE_CA, p_hca,
858                                                         remote_port_guid,
859                                                         remote_node_guid,
860                                                         remote_node_type,
861                                                         p_remote_hca_or_sw,
862                                                         is_cn);
863                 p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
864         }
865         __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
866
867 }                               /* __osm_ftree_hca_add_port() */
868
869 /***************************************************
870  **
871  ** ftree_fabric_t functions
872  **
873  ***************************************************/
874
875 static ftree_fabric_t *__osm_ftree_fabric_create()
876 {
877         ftree_fabric_t *p_ftree =
878             (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
879         if (p_ftree == NULL)
880                 return NULL;
881
882         memset(p_ftree, 0, sizeof(ftree_fabric_t));
883
884         cl_qmap_init(&p_ftree->hca_tbl);
885         cl_qmap_init(&p_ftree->sw_tbl);
886         cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
887         cl_qmap_init(&p_ftree->cn_guid_tbl);
888
889         cl_qlist_init(&p_ftree->root_guid_list);
890
891         return p_ftree;
892 }
893
894 /***************************************************/
895
896 static void __osm_ftree_fabric_clear(ftree_fabric_t * p_ftree)
897 {
898         ftree_hca_t *p_hca;
899         ftree_hca_t *p_next_hca;
900         ftree_sw_t *p_sw;
901         ftree_sw_t *p_next_sw;
902         ftree_sw_tbl_element_t *p_element;
903         ftree_sw_tbl_element_t *p_next_element;
904         name_map_item_t *p_guid_element, *p_next_guid_element;
905
906         if (!p_ftree)
907                 return;
908
909         /* remove all the elements of hca_tbl */
910
911         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
912         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
913                 p_hca = p_next_hca;
914                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
915                 __osm_ftree_hca_destroy(p_hca);
916         }
917         cl_qmap_remove_all(&p_ftree->hca_tbl);
918
919         /* remove all the elements of sw_tbl */
920
921         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
922         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
923                 p_sw = p_next_sw;
924                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
925                 __osm_ftree_sw_destroy(p_ftree, p_sw);
926         }
927         cl_qmap_remove_all(&p_ftree->sw_tbl);
928
929         /* remove all the elements of sw_by_tuple_tbl */
930
931         p_next_element =
932             (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
933         while (p_next_element !=
934                (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->
935                                                       sw_by_tuple_tbl)) {
936                 p_element = p_next_element;
937                 p_next_element =
938                     (ftree_sw_tbl_element_t *) cl_qmap_next(&p_element->
939                                                             map_item);
940                 __osm_ftree_sw_tbl_element_destroy(p_element);
941         }
942         cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
943
944         /* remove all the elements of cn_guid_tbl */
945         p_next_guid_element =
946             (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
947         while (p_next_guid_element !=
948                (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
949                 p_guid_element = p_next_guid_element;
950                 p_next_guid_element =
951                     (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
952                 free(p_guid_element);
953         }
954         cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
955
956         /* remove all the elements of root_guid_list */
957         while (!cl_is_qlist_empty(&p_ftree->root_guid_list))
958                 free(cl_qlist_remove_head(&p_ftree->root_guid_list));
959
960         /* free the leaf switches array */
961         if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
962                 free(p_ftree->leaf_switches);
963
964         p_ftree->leaf_switches_num = 0;
965         p_ftree->cn_num = 0;
966         p_ftree->leaf_switch_rank = 0;
967         p_ftree->max_switch_rank = 0;
968         p_ftree->max_cn_per_leaf = 0;
969         p_ftree->lft_max_lid_ho = 0;
970         p_ftree->leaf_switches = NULL;
971         p_ftree->fabric_built = FALSE;
972
973 }                               /* __osm_ftree_fabric_destroy() */
974
975 /***************************************************/
976
977 static void __osm_ftree_fabric_destroy(ftree_fabric_t * p_ftree)
978 {
979         if (!p_ftree)
980                 return;
981         __osm_ftree_fabric_clear(p_ftree);
982         free(p_ftree);
983 }
984
985 /***************************************************/
986
987 static uint8_t __osm_ftree_fabric_get_rank(ftree_fabric_t * p_ftree)
988 {
989         return p_ftree->leaf_switch_rank + 1;
990 }
991
992 /***************************************************/
993
994 static void __osm_ftree_fabric_add_hca(ftree_fabric_t * p_ftree,
995                                        osm_node_t * p_osm_node)
996 {
997         ftree_hca_t *p_hca = __osm_ftree_hca_create(p_osm_node);
998
999         CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
1000
1001         cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid,
1002                        &p_hca->map_item);
1003 }
1004
1005 /***************************************************/
1006
1007 static void __osm_ftree_fabric_add_sw(ftree_fabric_t * p_ftree,
1008                                       osm_switch_t * p_osm_sw)
1009 {
1010         ftree_sw_t *p_sw = __osm_ftree_sw_create(p_ftree, p_osm_sw);
1011
1012         CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
1013
1014         cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid,
1015                        &p_sw->map_item);
1016
1017         /* track the max lid (in host order) that exists in the fabric */
1018         if (cl_ntoh16(p_sw->base_lid) > p_ftree->lft_max_lid_ho)
1019                 p_ftree->lft_max_lid_ho = cl_ntoh16(p_sw->base_lid);
1020 }
1021
1022 /***************************************************/
1023
1024 static void __osm_ftree_fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1025                                                IN ftree_sw_t * p_sw)
1026 {
1027         CL_ASSERT(__osm_ftree_tuple_assigned(p_sw->tuple));
1028
1029         cl_qmap_insert(&p_ftree->sw_by_tuple_tbl,
1030                        __osm_ftree_tuple_to_key(p_sw->tuple),
1031                        &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
1032 }
1033
1034 /***************************************************/
1035
1036 static ftree_sw_t *__osm_ftree_fabric_get_sw_by_tuple(IN ftree_fabric_t *
1037                                                       p_ftree,
1038                                                       IN ftree_tuple_t tuple)
1039 {
1040         ftree_sw_tbl_element_t *p_element;
1041
1042         CL_ASSERT(__osm_ftree_tuple_assigned(tuple));
1043
1044         __osm_ftree_tuple_to_key(tuple);
1045
1046         p_element =
1047             (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1048                                                    __osm_ftree_tuple_to_key
1049                                                    (tuple));
1050         if (p_element ==
1051             (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1052                 return NULL;
1053
1054         return p_element->p_sw;
1055 }
1056
1057 /***************************************************/
1058
1059 static ftree_sw_t *__osm_ftree_fabric_get_sw_by_guid(IN ftree_fabric_t *
1060                                                      p_ftree, IN uint64_t guid)
1061 {
1062         ftree_sw_t *p_sw;
1063         p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1064         if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1065                 return NULL;
1066         return p_sw;
1067 }
1068
1069 /***************************************************/
1070
1071 static ftree_hca_t *__osm_ftree_fabric_get_hca_by_guid(IN ftree_fabric_t *
1072                                                        p_ftree,
1073                                                        IN uint64_t guid)
1074 {
1075         ftree_hca_t *p_hca;
1076         p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1077         if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1078                 return NULL;
1079         return p_hca;
1080 }
1081
1082 /***************************************************/
1083
1084 static void __osm_ftree_fabric_dump(ftree_fabric_t * p_ftree)
1085 {
1086         uint32_t i;
1087         ftree_hca_t *p_hca;
1088         ftree_sw_t *p_sw;
1089
1090         if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
1091                 return;
1092
1093         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1094                 "                       |-------------------------------|\n"
1095                 "                       |-  Full fabric topology dump  -|\n"
1096                 "                       |-------------------------------|\n\n");
1097
1098         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");
1099
1100         for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1101              p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
1102              p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
1103                 __osm_ftree_hca_dump(p_ftree, p_hca);
1104         }
1105
1106         for (i = 0; i < p_ftree->max_switch_rank; i++) {
1107                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1108                         "-- Rank %u switches\n", i);
1109                 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1110                      p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1111                      p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1112                         if (p_sw->rank == i)
1113                                 __osm_ftree_sw_dump(p_ftree, p_sw);
1114                 }
1115         }
1116
1117         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1118                 "                       |---------------------------------------|\n"
1119                 "                       |- Full fabric topology dump completed -|\n"
1120                 "                       |---------------------------------------|\n\n");
1121 }                               /* __osm_ftree_fabric_dump() */
1122
1123 /***************************************************/
1124
1125 static void __osm_ftree_fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
1126 {
1127         uint32_t i, j;
1128         ftree_sw_t *p_sw;
1129
1130         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1131                 "General fabric topology info\n");
1132         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1133                 "============================\n");
1134
1135         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1136                 "  - FatTree rank (roots to leaf switches): %u\n",
1137                 p_ftree->leaf_switch_rank + 1);
1138         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1139                 "  - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
1140         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1141                 "  - Fabric has %u CAs (%u of them CNs), %u switches\n",
1142                 cl_qmap_count(&p_ftree->hca_tbl), p_ftree->cn_num,
1143                 cl_qmap_count(&p_ftree->sw_tbl));
1144
1145         CL_ASSERT(cl_qmap_count(&p_ftree->hca_tbl) >= p_ftree->cn_num);
1146
1147         for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1148                 j = 0;
1149                 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1150                      p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1151                      p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1152                         if (p_sw->rank == i)
1153                                 j++;
1154                 }
1155                 if (i == 0)
1156                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1157                                 "  - Fabric has %u switches at rank %u (roots)\n",
1158                                 j, i);
1159                 else if (i == p_ftree->leaf_switch_rank)
1160                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1161                                 "  - Fabric has %u switches at rank %u (%u of them leafs)\n",
1162                                 j, i, p_ftree->leaf_switches_num);
1163                 else
1164                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1165                                 "  - Fabric has %u switches at rank %u\n", j,
1166                                 i);
1167         }
1168
1169         if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) {
1170                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1171                         "  - Root switches:\n");
1172                 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1173                      p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1174                      p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1175                         if (p_sw->rank == 0)
1176                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1177                                         "      GUID: 0x%016" PRIx64
1178                                         ", LID: %u, Index %s\n",
1179                                         __osm_ftree_sw_get_guid_ho(p_sw),
1180                                         cl_ntoh16(p_sw->base_lid),
1181                                         __osm_ftree_tuple_to_str(p_sw->tuple));
1182                 }
1183
1184                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1185                         "  - Leaf switches (sorted by index):\n");
1186                 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1187                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1188                                 "      GUID: 0x%016" PRIx64
1189                                 ", LID: %u, Index %s\n",
1190                                 __osm_ftree_sw_get_guid_ho(p_ftree->
1191                                                            leaf_switches[i]),
1192                                 cl_ntoh16(p_ftree->leaf_switches[i]->base_lid),
1193                                 __osm_ftree_tuple_to_str(p_ftree->
1194                                                          leaf_switches[i]->
1195                                                          tuple));
1196                 }
1197         }
1198 }                               /* __osm_ftree_fabric_dump_general_info() */
1199
1200 /***************************************************/
1201
1202 static void __osm_ftree_fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
1203 {
1204         ftree_hca_t *p_hca;
1205         ftree_sw_t *p_sw;
1206         ftree_port_group_t *p_group_on_sw;
1207         ftree_port_group_t *p_group_on_hca;
1208         uint32_t i;
1209         uint32_t j;
1210         unsigned printed_hcas_on_leaf;
1211
1212         char path[1024];
1213         FILE *p_hca_ordering_file;
1214         char *filename = "opensm-ftree-ca-order.dump";
1215
1216         snprintf(path, sizeof(path), "%s/%s",
1217                  p_ftree->p_osm->subn.opt.dump_files_dir, filename);
1218         p_hca_ordering_file = fopen(path, "w");
1219         if (!p_hca_ordering_file) {
1220                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
1221                         "cannot open file \'%s\': %s\n", filename,
1222                         strerror(errno));
1223                 return;
1224         }
1225
1226         /* for each leaf switch (in indexing order) */
1227         for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1228                 p_sw = p_ftree->leaf_switches[i];
1229                 printed_hcas_on_leaf = 0;
1230
1231                 /* for each real CA (CNs and not) connected to this switch */
1232                 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1233                         p_group_on_sw = p_sw->down_port_groups[j];
1234
1235                         if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
1236                                 continue;
1237
1238                         p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
1239                         p_group_on_hca =
1240                             __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
1241                                                                          p_group_on_sw->
1242                                                                          base_lid);
1243
1244                         /* treat non-compute nodes as dummies */
1245                         if (!p_group_on_hca->is_cn)
1246                                 continue;
1247
1248                         fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
1249                                 cl_ntoh16(p_group_on_hca->base_lid),
1250                                 p_hca->p_osm_node->print_desc);
1251
1252                         printed_hcas_on_leaf++;
1253                 }
1254
1255                 /* now print missing HCAs */
1256                 for (j = 0;
1257                      j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++)
1258                         fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");
1259
1260         }
1261         /* done going through all the leaf switches */
1262
1263         fclose(p_hca_ordering_file);
1264 }                               /* __osm_ftree_fabric_dump_hca_ordering() */
1265
1266 /***************************************************/
1267
1268 static void
1269 __osm_ftree_fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
1270                                 IN ftree_sw_t * p_sw,
1271                                 IN ftree_tuple_t new_tuple)
1272 {
1273         memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
1274         __osm_ftree_fabric_add_sw_by_tuple(p_ftree, p_sw);
1275 }
1276
1277 /***************************************************/
1278
1279 static void __osm_ftree_fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
1280                                                   IN ftree_sw_t * p_sw)
1281 {
1282         uint8_t i;
1283         ftree_tuple_t new_tuple;
1284
1285         __osm_ftree_tuple_init(new_tuple);
1286         new_tuple[0] = (uint8_t) p_sw->rank;
1287         for (i = 1; i <= p_sw->rank; i++)
1288                 new_tuple[i] = 0;
1289
1290         __osm_ftree_fabric_assign_tuple(p_ftree, p_sw, new_tuple);
1291 }
1292
1293 /***************************************************/
1294
1295 static void
1296 __osm_ftree_fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
1297                                  OUT ftree_tuple_t new_tuple,
1298                                  IN ftree_tuple_t from_tuple,
1299                                  IN ftree_direction_t direction)
1300 {
1301         ftree_sw_t *p_sw;
1302         ftree_tuple_t temp_tuple;
1303         uint8_t var_index;
1304         uint8_t i;
1305
1306         __osm_ftree_tuple_init(new_tuple);
1307         memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
1308
1309         if (direction == FTREE_DIRECTION_DOWN) {
1310                 temp_tuple[0]++;
1311                 var_index = from_tuple[0] + 1;
1312         } else {
1313                 temp_tuple[0]--;
1314                 var_index = from_tuple[0];
1315         }
1316
1317         for (i = 0; i < 0xFF; i++) {
1318                 temp_tuple[var_index] = i;
1319                 p_sw = __osm_ftree_fabric_get_sw_by_tuple(p_ftree, temp_tuple);
1320                 if (p_sw == NULL)       /* found free tuple */
1321                         break;
1322         }
1323
1324         if (i == 0xFF) {
1325                 /* new tuple not found - there are more than 255 ports in one direction */
1326                 return;
1327         }
1328         memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
1329
1330 }                               /* __osm_ftree_fabric_get_new_tuple() */
1331
1332 /***************************************************/
1333
1334 static inline boolean_t __osm_ftree_fabric_roots_provided(IN ftree_fabric_t *
1335                                                           p_ftree)
1336 {
1337         return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1338 }
1339
1340 /***************************************************/
1341
1342 static inline boolean_t __osm_ftree_fabric_cns_provided(IN ftree_fabric_t *
1343                                                         p_ftree)
1344 {
1345         return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1346 }
1347
1348 /***************************************************/
1349
1350 static int __osm_ftree_fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
1351 {
1352         ftree_sw_t *p_sw;
1353         ftree_hca_t *p_hca;
1354         ftree_hca_t *p_next_hca;
1355         unsigned i;
1356         int res = 0;
1357
1358         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1359
1360         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1361                 "Marking leaf switches in fabric\n");
1362
1363         /* Scan all the CAs, if they have CNs - find CN port and mark switch
1364            that is connected to this port as leaf switch.
1365            Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */
1366         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1367         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
1368                 p_hca = p_next_hca;
1369                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
1370                 if (!p_hca->cn_num)
1371                         continue;
1372
1373                 for (i = 0; i < p_hca->up_port_groups_num; i++) {
1374                         if (!p_hca->up_port_groups[i]->is_cn)
1375                                 continue;
1376
1377                         /* In CAs, port group alway has one port, and since this
1378                            port group is CN, we know that this port is compute node */
1379                         CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
1380                                   IB_NODE_TYPE_SWITCH);
1381                         p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
1382
1383                         /* check if this switch was already processed */
1384                         if (p_sw->is_leaf)
1385                                 continue;
1386                         p_sw->is_leaf = TRUE;
1387
1388                         /* ensure that this leaf switch is at the correct tree level */
1389                         if (p_sw->rank != p_ftree->leaf_switch_rank) {
1390                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1391                                         "ERR AB26: CN port 0x%" PRIx64
1392                                         " is connected to switch 0x%" PRIx64
1393                                         " with rank %u, "
1394                                         "while FatTree leaf rank is %u\n",
1395                                         cl_ntoh64(p_hca->up_port_groups[i]->
1396                                                   port_guid),
1397                                         __osm_ftree_sw_get_guid_ho(p_sw),
1398                                         p_sw->rank, p_ftree->leaf_switch_rank);
1399                                 res = -1;
1400                                 goto Exit;
1401
1402                         }
1403                 }
1404         }
1405
1406 Exit:
1407         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1408         return res;
1409 }                               /* __osm_ftree_fabric_mark_leaf_switches() */
1410
1411 /***************************************************/
1412
1413 static void __osm_ftree_fabric_make_indexing(IN ftree_fabric_t * p_ftree)
1414 {
1415         ftree_sw_t *p_remote_sw;
1416         ftree_sw_t *p_sw = NULL;
1417         ftree_sw_t *p_next_sw;
1418         ftree_tuple_t new_tuple;
1419         uint32_t i;
1420         cl_list_t bfs_list;
1421         ftree_sw_tbl_element_t *p_sw_tbl_element;
1422
1423         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1424
1425         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1426                 "Starting FatTree indexing\n");
1427
1428         /* using the first leaf switch as a starting point for indexing algorithm. */
1429         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1430         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1431                 p_sw = p_next_sw;
1432                 if (p_sw->is_leaf)
1433                         break;
1434                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1435         }
1436
1437         CL_ASSERT(p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl));
1438
1439         /* Assign the first tuple to the switch that is used as BFS starting point.
1440            The tuple will be as follows: [rank].0.0.0...
1441            This fuction also adds the switch it into the switch_by_tuple table. */
1442         __osm_ftree_fabric_assign_first_tuple(p_ftree, p_sw);
1443
1444         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1445                 "Indexing starting point:\n"
1446                 "                                            - Switch rank  : %u\n"
1447                 "                                            - Switch index : %s\n"
1448                 "                                            - Node LID     : %u\n"
1449                 "                                            - Node GUID    : 0x%016"
1450                 PRIx64 "\n", p_sw->rank, __osm_ftree_tuple_to_str(p_sw->tuple),
1451                 cl_ntoh16(p_sw->base_lid), __osm_ftree_sw_get_guid_ho(p_sw));
1452
1453         /*
1454          * Now run BFS and assign indexes to all switches
1455          * Pseudo code of the algorithm is as follows:
1456          *
1457          *  * Add first switch to BFS queue
1458          *  * While (BFS queue not empty)
1459          *      - Pop the switch from the head of the queue
1460          *      - Scan all the downward and upward ports
1461          *      - For each port
1462          *          + Get the remote switch
1463          *          + Assign index to the remote switch
1464          *          + Add remote switch to the BFS queue
1465          */
1466
1467         cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
1468         cl_list_insert_tail(&bfs_list,
1469                             &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
1470
1471         while (!cl_is_list_empty(&bfs_list)) {
1472                 p_sw_tbl_element =
1473                     (ftree_sw_tbl_element_t *) cl_list_remove_head(&bfs_list);
1474                 p_sw = p_sw_tbl_element->p_sw;
1475                 __osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element);
1476
1477                 /* Discover all the nodes from ports that are pointing down */
1478
1479                 if (p_sw->rank >= p_ftree->leaf_switch_rank) {
1480                         /* whether downward ports are pointing to CAs or switches,
1481                            we don't assign indexes to switches that are located
1482                            lower than leaf switches */
1483                 } else {
1484                         /* This is not the leaf switch */
1485                         for (i = 0; i < p_sw->down_port_groups_num; i++) {
1486                                 /* Work with port groups that are pointing to switches only.
1487                                    No need to assign indexing to HCAs */
1488                                 if (p_sw->down_port_groups[i]->
1489                                     remote_node_type != IB_NODE_TYPE_SWITCH)
1490                                         continue;
1491
1492                                 p_remote_sw =
1493                                     p_sw->down_port_groups[i]->remote_hca_or_sw.
1494                                     p_sw;
1495                                 if (__osm_ftree_tuple_assigned
1496                                     (p_remote_sw->tuple)) {
1497                                         /* this switch has been already indexed */
1498                                         continue;
1499                                 }
1500                                 /* allocate new tuple */
1501                                 __osm_ftree_fabric_get_new_tuple(p_ftree,
1502                                                                  new_tuple,
1503                                                                  p_sw->tuple,
1504                                                                  FTREE_DIRECTION_DOWN);
1505                                 /* Assign the new tuple to the remote switch.
1506                                    This fuction also adds the switch into the switch_by_tuple table. */
1507                                 __osm_ftree_fabric_assign_tuple(p_ftree,
1508                                                                 p_remote_sw,
1509                                                                 new_tuple);
1510
1511                                 /* add the newly discovered switch to the BFS queue */
1512                                 cl_list_insert_tail(&bfs_list,
1513                                                     &__osm_ftree_sw_tbl_element_create
1514                                                     (p_remote_sw)->map_item);
1515                         }
1516                         /* Done assigning indexes to all the remote switches
1517                            that are pointed by the downgoing ports.
1518                            Now sort port groups according to remote index. */
1519                         qsort(p_sw->down_port_groups,   /* array */
1520                               p_sw->down_port_groups_num,       /* number of elements */
1521                               sizeof(ftree_port_group_t *),     /* size of each element */
1522                               __osm_ftree_compare_port_groups_by_remote_switch_index);  /* comparator */
1523                 }
1524
1525                 /* Done indexing switches from ports that go down.
1526                    Now do the same with ports that are pointing up. */
1527
1528                 if (p_sw->rank != 0) {
1529                         /* This is not the root switch, which means that all the ports
1530                            that are pointing up are taking us to another switches. */
1531                         for (i = 0; i < p_sw->up_port_groups_num; i++) {
1532                                 p_remote_sw =
1533                                     p_sw->up_port_groups[i]->remote_hca_or_sw.
1534                                     p_sw;
1535                                 if (__osm_ftree_tuple_assigned
1536                                     (p_remote_sw->tuple))
1537                                         continue;
1538                                 /* allocate new tuple */
1539                                 __osm_ftree_fabric_get_new_tuple(p_ftree,
1540                                                                  new_tuple,
1541                                                                  p_sw->tuple,
1542                                                                  FTREE_DIRECTION_UP);
1543                                 /* Assign the new tuple to the remote switch.
1544                                    This fuction also adds the switch to the
1545                                    switch_by_tuple table. */
1546                                 __osm_ftree_fabric_assign_tuple(p_ftree,
1547                                                                 p_remote_sw,
1548                                                                 new_tuple);
1549                                 /* add the newly discovered switch to the BFS queue */
1550                                 cl_list_insert_tail(&bfs_list,
1551                                                     &__osm_ftree_sw_tbl_element_create
1552                                                     (p_remote_sw)->map_item);
1553                         }
1554                         /* Done assigning indexes to all the remote switches
1555                            that are pointed by the upgoing ports.
1556                            Now sort port groups according to remote index. */
1557                         qsort(p_sw->up_port_groups,     /* array */
1558                               p_sw->up_port_groups_num, /* number of elements */
1559                               sizeof(ftree_port_group_t *),     /* size of each element */
1560                               __osm_ftree_compare_port_groups_by_remote_switch_index);  /* comparator */
1561                 }
1562                 /* Done assigning indexes to all the switches that are directly connected
1563                    to the current switch - go to the next switch in the BFS queue */
1564         }
1565         cl_list_destroy(&bfs_list);
1566
1567         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1568 }                               /* __osm_ftree_fabric_make_indexing() */
1569
1570 /***************************************************/
1571
1572 static int __osm_ftree_fabric_create_leaf_switch_array(IN ftree_fabric_t *
1573                                                        p_ftree)
1574 {
1575         ftree_sw_t *p_sw;
1576         ftree_sw_t *p_next_sw;
1577         ftree_sw_t **all_switches_at_leaf_level;
1578         unsigned i;
1579         unsigned all_leaf_idx = 0;
1580         unsigned first_leaf_idx;
1581         unsigned last_leaf_idx;
1582         int res = 0;
1583
1584         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1585
1586         /* create array of ALL the switches that have leaf rank */
1587         all_switches_at_leaf_level = (ftree_sw_t **)
1588             malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1589         if (!all_switches_at_leaf_level) {
1590                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1591                         "Fat-tree routing: Memory allocation failed\n");
1592                 res = -1;
1593                 goto Exit;
1594         }
1595         memset(all_switches_at_leaf_level, 0,
1596                cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1597
1598         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1599         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1600                 p_sw = p_next_sw;
1601                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1602                 if (p_sw->rank == p_ftree->leaf_switch_rank) {
1603                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1604                                 "Adding switch 0x%" PRIx64
1605                                 " to full leaf switch array\n",
1606                                 __osm_ftree_sw_get_guid_ho(p_sw));
1607                         all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
1608
1609                 }
1610         }
1611
1612         /* quick-sort array of leaf switches by index */
1613         qsort(all_switches_at_leaf_level,       /* array */
1614               all_leaf_idx,     /* number of elements */
1615               sizeof(ftree_sw_t *),     /* size of each element */
1616               __osm_ftree_compare_switches_by_index);   /* comparator */
1617
1618         /* check the first and the last REAL leaf (the one
1619            that has CNs) in the array of all the leafs */
1620
1621         first_leaf_idx = all_leaf_idx;
1622         last_leaf_idx = 0;
1623         for (i = 0; i < all_leaf_idx; i++) {
1624                 if (all_switches_at_leaf_level[i]->is_leaf) {
1625                         if (i < first_leaf_idx)
1626                                 first_leaf_idx = i;
1627                         last_leaf_idx = i;
1628                 }
1629         }
1630         CL_ASSERT(first_leaf_idx < last_leaf_idx);
1631
1632         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1633                 "Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
1634                 first_leaf_idx, last_leaf_idx);
1635
1636         /* Create array of REAL leaf switches, sorted by index.
1637            This array may contain switches at the same rank w/o CNs,
1638            in case this is the order of indexing. */
1639         p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
1640         p_ftree->leaf_switches = (ftree_sw_t **)
1641             malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1642         if (!p_ftree->leaf_switches) {
1643                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1644                         "Fat-tree routing: Memory allocation failed\n");
1645                 res = -1;
1646                 goto Exit;
1647         }
1648
1649         memcpy(p_ftree->leaf_switches,
1650                &(all_switches_at_leaf_level[first_leaf_idx]),
1651                p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1652
1653         free(all_switches_at_leaf_level);
1654
1655         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1656                 "Created array of %u leaf switches\n",
1657                 p_ftree->leaf_switches_num);
1658
1659 Exit:
1660         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1661         return res;
1662 }                               /* __osm_ftree_fabric_create_leaf_switch_array() */
1663
1664 /***************************************************/
1665
1666 static void __osm_ftree_fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
1667 {
1668         unsigned i;
1669         unsigned j;
1670         unsigned cns_on_this_leaf;
1671         ftree_sw_t *p_sw;
1672         ftree_port_group_t *p_group;
1673
1674         for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1675                 p_sw = p_ftree->leaf_switches[i];
1676                 cns_on_this_leaf = 0;
1677                 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1678                         p_group = p_sw->down_port_groups[j];
1679                         if (p_group->remote_node_type != IB_NODE_TYPE_CA)
1680                                 continue;
1681                         cns_on_this_leaf +=
1682                             p_group->remote_hca_or_sw.p_hca->cn_num;
1683                 }
1684                 if (cns_on_this_leaf > p_ftree->max_cn_per_leaf)
1685                         p_ftree->max_cn_per_leaf = cns_on_this_leaf;
1686         }
1687 }                               /* __osm_ftree_fabric_set_max_cn_per_leaf() */
1688
1689 /***************************************************/
1690
1691 static boolean_t __osm_ftree_fabric_validate_topology(IN ftree_fabric_t *
1692                                                       p_ftree)
1693 {
1694         ftree_port_group_t *p_group;
1695         ftree_port_group_t *p_ref_group;
1696         ftree_sw_t *p_sw;
1697         ftree_sw_t *p_next_sw;
1698         ftree_sw_t **reference_sw_arr;
1699         uint16_t tree_rank = __osm_ftree_fabric_get_rank(p_ftree);
1700         boolean_t res = TRUE;
1701         uint8_t i;
1702
1703         OSM_LOG_ENTER(&p_ftree->p_osm->log);
1704
1705         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1706                 "Validating fabric topology\n");
1707
1708         reference_sw_arr =
1709             (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1710         if (reference_sw_arr == NULL) {
1711                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1712                         "Fat-tree routing: Memory allocation failed\n");
1713                 return FALSE;
1714         }
1715         memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1716
1717         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1718         while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1719                 p_sw = p_next_sw;
1720                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1721
1722                 if (!reference_sw_arr[p_sw->rank]) {
1723                         /* This is the first switch in the current level that
1724                            we're checking - use it as a reference */
1725                         reference_sw_arr[p_sw->rank] = p_sw;
1726                 } else {
1727                         /* compare this switch properties to the reference switch */
1728
1729                         if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1730                             p_sw->up_port_groups_num) {
1731                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1732                                         "ERR AB09: Different number of upward port groups on switches:\n"
1733                                         "       GUID 0x%016" PRIx64
1734                                         ", LID %u, Index %s - %u groups\n"
1735                                         "       GUID 0x%016" PRIx64
1736                                         ", LID %u, Index %s - %u groups\n",
1737                                         __osm_ftree_sw_get_guid_ho
1738                                         (reference_sw_arr[p_sw->rank]),
1739                                         cl_ntoh16(reference_sw_arr[p_sw->rank]->
1740                                                   base_lid),
1741                                         __osm_ftree_tuple_to_str
1742                                         (reference_sw_arr[p_sw->rank]->tuple),
1743                                         reference_sw_arr[p_sw->rank]->
1744                                         up_port_groups_num,
1745                                         __osm_ftree_sw_get_guid_ho(p_sw),
1746                                         cl_ntoh16(p_sw->base_lid),
1747                                         __osm_ftree_tuple_to_str(p_sw->tuple),
1748                                         p_sw->up_port_groups_num);
1749                                 res = FALSE;
1750                                 break;
1751                         }
1752
1753                         if (p_sw->rank != (tree_rank - 1) &&
1754                             reference_sw_arr[p_sw->rank]->
1755                             down_port_groups_num !=
1756                             p_sw->down_port_groups_num) {
1757                                 /* we're allowing some hca's to be missing */
1758                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1759                                         "ERR AB0A: Different number of downward port groups on switches:\n"
1760                                         "       GUID 0x%016" PRIx64
1761                                         ", LID %u, Index %s - %u port groups\n"
1762                                         "       GUID 0x%016" PRIx64
1763                                         ", LID %u, Index %s - %u port groups\n",
1764                                         __osm_ftree_sw_get_guid_ho
1765                                         (reference_sw_arr[p_sw->rank]),
1766                                         cl_ntoh16(reference_sw_arr[p_sw->rank]->
1767                                                   base_lid),
1768                                         __osm_ftree_tuple_to_str
1769                                         (reference_sw_arr[p_sw->rank]->tuple),
1770                                         reference_sw_arr[p_sw->rank]->
1771                                         down_port_groups_num,
1772                                         __osm_ftree_sw_get_guid_ho(p_sw),
1773                                         cl_ntoh16(p_sw->base_lid),
1774                                         __osm_ftree_tuple_to_str(p_sw->tuple),
1775                                         p_sw->down_port_groups_num);
1776                                 res = FALSE;
1777                                 break;
1778                         }
1779
1780                         if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1781                             0) {
1782                                 p_ref_group =
1783                                     reference_sw_arr[p_sw->rank]->
1784                                     up_port_groups[0];
1785                                 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1786                                         p_group = p_sw->up_port_groups[i];
1787                                         if (cl_ptr_vector_get_size
1788                                             (&p_ref_group->ports) !=
1789                                             cl_ptr_vector_get_size(&p_group->
1790                                                                    ports)) {
1791                                                 OSM_LOG(&p_ftree->p_osm->log,
1792                                                         OSM_LOG_ERROR,
1793                                                         "ERR AB0B: Different number of ports in an upward port group on switches:\n"
1794                                                         "       GUID 0x%016"
1795                                                         PRIx64
1796                                                         ", LID %u, Index %s - %u ports\n"
1797                                                         "       GUID 0x%016"
1798                                                         PRIx64
1799                                                         ", LID %u, Index %s - %u ports\n",
1800                                                         __osm_ftree_sw_get_guid_ho
1801                                                         (reference_sw_arr
1802                                                          [p_sw->rank]),
1803                                                         cl_ntoh16
1804                                                         (reference_sw_arr
1805                                                          [p_sw->rank]->
1806                                                          base_lid),
1807                                                         __osm_ftree_tuple_to_str
1808                                                         (reference_sw_arr
1809                                                          [p_sw->rank]->tuple),
1810                                                         cl_ptr_vector_get_size
1811                                                         (&p_ref_group->ports),
1812                                                         __osm_ftree_sw_get_guid_ho
1813                                                         (p_sw),
1814                                                         cl_ntoh16(p_sw->
1815                                                                   base_lid),
1816                                                         __osm_ftree_tuple_to_str
1817                                                         (p_sw->tuple),
1818                                                         cl_ptr_vector_get_size
1819                                                         (&p_group->ports));
1820                                                 res = FALSE;
1821                                                 break;
1822                                         }
1823                                 }
1824                         }
1825                         if (reference_sw_arr[p_sw->rank]->
1826                             down_port_groups_num != 0
1827                             && p_sw->rank != (tree_rank - 1)) {
1828                                 /* we're allowing some hca's to be missing */
1829                                 p_ref_group =
1830                                     reference_sw_arr[p_sw->rank]->
1831                                     down_port_groups[0];
1832                                 for (i = 0; i < p_sw->down_port_groups_num; i++) {
1833                                         p_group = p_sw->down_port_groups[0];
1834                                         if (cl_ptr_vector_get_size
1835                                             (&p_ref_group->ports) !=
1836                                             cl_ptr_vector_get_size(&p_group->
1837                                                                    ports)) {
1838                                                 OSM_LOG(&p_ftree->p_osm->log,
1839                                                         OSM_LOG_ERROR,
1840                                                         "ERR AB0C: Different number of ports in an downward port group on switches:\n"
1841                                                         "       GUID 0x%016"
1842                                                         PRIx64
1843                                                         ", LID %u, Index %s - %u ports\n"
1844                                                         "       GUID 0x%016"
1845                                                         PRIx64
1846                                                         ", LID %u, Index %s - %u ports\n",
1847                                                         __osm_ftree_sw_get_guid_ho
1848                                                         (reference_sw_arr
1849                                                          [p_sw->rank]),
1850                                                         cl_ntoh16
1851                                                         (reference_sw_arr
1852                                                          [p_sw->rank]->
1853                                                          base_lid),
1854                                                         __osm_ftree_tuple_to_str
1855                                                         (reference_sw_arr
1856                                                          [p_sw->rank]->tuple),
1857                                                         cl_ptr_vector_get_size
1858                                                         (&p_ref_group->ports),
1859                                                         __osm_ftree_sw_get_guid_ho
1860                                                         (p_sw),
1861                                                         cl_ntoh16(p_sw->
1862                                                                   base_lid),
1863                                                         __osm_ftree_tuple_to_str
1864                                                         (p_sw->tuple),
1865                                                         cl_ptr_vector_get_size
1866                                                         (&p_group->ports));
1867                                                 res = FALSE;
1868                                                 break;
1869                                         }
1870                                 }
1871                         }
1872                 }               /* end of else */
1873         }                       /* end of while */
1874
1875         if (res == TRUE)
1876                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1877                         "Fabric topology has been identified as FatTree\n");
1878         else
1879                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1880                         "ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
1881
1882         free(reference_sw_arr);
1883         OSM_LOG_EXIT(&p_ftree->p_osm->log);
1884         return res;
1885 }                               /* __osm_ftree_fabric_validate_topology() */
1886
1887 /***************************************************
1888  ***************************************************/
1889
1890 static void __osm_ftree_set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
1891                                          IN void *context)
1892 {
1893         ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
1894         ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
1895
1896         p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid_ho;
1897         osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr,
1898                                     p_sw->p_osm_sw);
1899 }
1900
1901 /***************************************************
1902  ***************************************************/
1903
1904 /*
1905  * Function: assign-up-going-port-by-descending-down
1906  * Given   : a switch and a LID
1907  * Pseudo code:
1908  *    foreach down-going-port-group (in indexing order)
1909  *        skip this group if the LFT(LID) port is part of this group
1910  *        find the least loaded port of the group (scan in indexing order)
1911  *        r-port is the remote port connected to it
1912  *        assign the remote switch node LFT(LID) to r-port
1913  *        increase r-port usage counter
1914  *        assign-up-going-port-by-descending-down to r-port node (recursion)
1915  */
1916
1917 static void
1918 __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
1919                                                IN ftree_sw_t * p_sw,
1920                                                IN ftree_sw_t * p_prev_sw,
1921                                                IN ib_net16_t target_lid,
1922                                                IN uint8_t target_rank,
1923                                                IN boolean_t is_real_lid,
1924                                                IN boolean_t is_main_path,
1925                                                IN uint8_t highest_rank_in_route)
1926 {
1927         ftree_sw_t *p_remote_sw;
1928         uint16_t ports_num;
1929         ftree_port_group_t *p_group;
1930         ftree_port_t *p_port;
1931         ftree_port_t *p_min_port;
1932         uint16_t i;
1933         uint16_t j;
1934         uint16_t k;
1935
1936         /* we shouldn't enter here if both real_lid and main_path are false */
1937         CL_ASSERT(is_real_lid || is_main_path);
1938
1939         /* if there is no down-going ports */
1940         if (p_sw->down_port_groups_num == 0)
1941                 return;
1942
1943         /* promote the index that indicates which group should we
1944            start with when going through all the downgoing groups */
1945         p_sw->down_port_groups_idx =
1946                 (p_sw->down_port_groups_idx + 1) % p_sw->down_port_groups_num;
1947
1948         /* foreach down-going port group (in indexing order) */
1949         i = p_sw->down_port_groups_idx;
1950         for (k = 0; k < p_sw->down_port_groups_num; k++) {
1951
1952                 p_group = p_sw->down_port_groups[i];
1953                 i = (i + 1) % p_sw->down_port_groups_num;
1954
1955                 /* Skip this port group unless it points to a switch */
1956                 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
1957                         continue;
1958
1959                 if (p_prev_sw
1960                     && (p_group->remote_base_lid == p_prev_sw->base_lid)) {
1961                         /* This port group has a port that was used when we entered this switch,
1962                            which means that the current group points to the switch where we were
1963                            at the previous step of the algorithm (before going up).
1964                            Skipping this group. */
1965                         continue;
1966                 }
1967
1968                 /* find the least loaded port of the group (in indexing order) */
1969                 p_min_port = NULL;
1970                 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
1971                 /* ToDo: no need to select a least loaded port for non-main path.
1972                    Think about optimization. */
1973                 for (j = 0; j < ports_num; j++) {
1974                         cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
1975                         if (!p_min_port) {
1976                                 /* first port that we're checking - set as port with the lowest load */
1977                                 p_min_port = p_port;
1978                         } else if (p_port->counter_up < p_min_port->counter_up) {
1979                                 /* this port is less loaded - use it as min */
1980                                 p_min_port = p_port;
1981                         }
1982                 }
1983                 /* At this point we have selected a port in this group with the
1984                    lowest load of upgoing routes.
1985                    Set on the remote switch how to get to the target_lid -
1986                    set LFT(target_lid) on the remote switch to the remote port */
1987                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
1988
1989                 if (osm_switch_get_least_hops(p_remote_sw->p_osm_sw,
1990                                               cl_ntoh16(target_lid)) !=
1991                     OSM_NO_PATH) {
1992                         /* Loop in the fabric - we already routed the remote switch
1993                            on our way UP, and now we see it again on our way DOWN */
1994                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1995                                 "Loop of lenght %d in the fabric:\n                             "
1996                                 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
1997                                 (p_remote_sw->rank - highest_rank_in_route) * 2,
1998                                 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
1999                                 cl_ntoh16(p_group->base_lid),
2000                                 __osm_ftree_tuple_to_str(p_sw->tuple),
2001                                 cl_ntoh16(p_group->remote_base_lid));
2002                         continue;
2003                 }
2004
2005                 /* Four possible cases:
2006                  *
2007                  *  1. is_real_lid == TRUE && is_main_path == TRUE:
2008                  *      - going DOWN(TRUE,TRUE) through ALL the groups
2009                  *         + promoting port counter
2010                  *         + setting path in remote switch fwd tbl
2011                  *         + setting hops in remote switch on all the ports of each group
2012                  *
2013                  *  2. is_real_lid == TRUE && is_main_path == FALSE:
2014                  *      - going DOWN(TRUE,FALSE) through ALL the groups but only if
2015                  *        the remote (lower) switch hasn't been already configured
2016                  *        for this target LID
2017                  *         + NOT promoting port counter
2018                  *         + setting path in remote switch fwd tbl if it hasn't been set yet
2019                  *         + setting hops in remote switch on all the ports of each group
2020                  *           if it hasn't been set yet
2021                  *
2022                  *  3. is_real_lid == FALSE && is_main_path == TRUE:
2023                  *      - going DOWN(FALSE,TRUE) through ALL the groups
2024                  *         + promoting port counter
2025                  *         + NOT setting path in remote switch fwd tbl
2026                  *         + NOT setting hops in remote switch
2027                  *
2028                  *  4. is_real_lid == FALSE && is_main_path == FALSE:
2029                  *      - illegal state - we shouldn't get here
2030                  */
2031
2032                 /* second case: skip the port group if the remote (lower)
2033                    switch has been already configured for this target LID */
2034                 if (is_real_lid && !is_main_path &&
2035                     p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
2036                         continue;
2037
2038                 /* setting fwd tbl port only if this is real LID */
2039                 if (is_real_lid) {
2040                         p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2041                                 p_min_port->remote_port_num;
2042                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2043                                 "Switch %s: set path to CA LID %u through port %u\n",
2044                                 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
2045                                 cl_ntoh16(target_lid),
2046                                 p_min_port->remote_port_num);
2047
2048                         /* On the remote switch that is pointed by the p_group,
2049                            set hops for ALL the ports in the remote group. */
2050
2051                         for (j = 0; j < ports_num; j++) {
2052                                 cl_ptr_vector_at(&p_group->ports, j,
2053                                                  (void *)&p_port);
2054
2055                                 __osm_ftree_sw_set_hops(p_remote_sw,
2056                                                         cl_ntoh16(target_lid),
2057                                                         p_port->remote_port_num,
2058                                                         ((target_rank -
2059                                                           highest_rank_in_route)
2060                                                          + (p_remote_sw->rank -
2061                                                             highest_rank_in_route)));
2062                         }
2063
2064                 }
2065
2066                 /* The number of upgoing routes is tracked in the
2067                    p_port->counter_up counter of the port that belongs to
2068                    the upper side of the link (on switch with lower rank).
2069                    Counter is promoted only if we're routing LID on the main
2070                    path (whether it's a real LID or a dummy one). */
2071                 if (is_main_path)
2072                         p_min_port->counter_up++;
2073
2074                 /* Recursion step:
2075                    Assign upgoing ports by stepping down, starting on REMOTE switch */
2076                 __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw,    /* remote switch - used as a route-upgoing alg. start point */
2077                                                                NULL,    /* prev. position - NULL to mark that we went down and not up */
2078                                                                target_lid,      /* LID that we're routing to */
2079                                                                target_rank,     /* rank of the LID that we're routing to */
2080                                                                is_real_lid,     /* whether the target LID is real or dummy */
2081                                                                is_main_path,    /* whether this is path to HCA that should by tracked by counters */
2082                                                                highest_rank_in_route);  /* highest visited point in the tree before going down */
2083         }
2084         /* done scanning all the down-going port groups */
2085
2086 }                               /* __osm_ftree_fabric_route_upgoing_by_going_down() */
2087
2088 /***************************************************/
2089
2090 /*
2091  * Function: assign-down-going-port-by-ascending-up
2092  * Given   : a switch and a LID
2093  * Pseudo code:
2094  *    find the least loaded port of all the upgoing groups (scan in indexing order)
2095  *    assign the LFT(LID) of remote switch to that port
2096  *    track that port usage
2097  *    assign-up-going-port-by-descending-down on CURRENT switch
2098  *    assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
2099  */
2100
2101 static void
2102 __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2103                                                IN ftree_sw_t * p_sw,
2104                                                IN ftree_sw_t * p_prev_sw,
2105                                                IN ib_net16_t target_lid,
2106                                                IN uint8_t target_rank,
2107                                                IN boolean_t is_real_lid,
2108                                                IN boolean_t is_main_path)
2109 {
2110         ftree_sw_t *p_remote_sw;
2111         uint16_t ports_num;
2112         ftree_port_group_t *p_group;
2113         ftree_port_t *p_port;
2114         ftree_port_group_t *p_min_group;
2115         ftree_port_t *p_min_port;
2116         uint16_t i;
2117         uint16_t j;
2118
2119         /* we shouldn't enter here if both real_lid and main_path are false */
2120         CL_ASSERT(is_real_lid || is_main_path);
2121
2122         /* Assign upgoing ports by stepping down, starting on THIS switch */
2123         __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw,   /* local switch - used as a route-upgoing alg. start point */
2124                                                        p_prev_sw,       /* switch that we went up from (NULL means that we went down) */
2125                                                        target_lid,      /* LID that we're routing to */
2126                                                        target_rank,     /* rank of the LID that we're routing to */
2127                                                        is_real_lid,     /* whether this target LID is real or dummy */
2128                                                        is_main_path,    /* whether this path to HCA should by tracked by counters */
2129                                                        p_sw->rank);     /* the highest visited point in the tree before going down */
2130
2131         /* recursion stop condition - if it's a root switch, */
2132         if (p_sw->rank == 0)
2133                 return;
2134
2135         /* Find the least loaded upgoing port group */
2136         p_min_group = NULL;
2137         for (i = 0; i < p_sw->up_port_groups_num; i++) {
2138                 p_group = p_sw->up_port_groups[i];
2139                 if (!p_min_group) {
2140                         /* first group that we're checking - use
2141                            it as a group with the lowest load */
2142                         p_min_group = p_group;
2143                 } else if (p_group->counter_down < p_min_group->counter_down) {
2144                         /* this group is less loaded - use it as min */
2145                         p_min_group = p_group;
2146                 }
2147         }
2148
2149         /* Find the least loaded upgoing port in the selected group */
2150         p_min_port = NULL;
2151         ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2152         for (j = 0; j < ports_num; j++) {
2153                 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2154                 if (!p_min_port) {
2155                         /* first port that we're checking - use
2156                            it as a port with the lowest load */
2157                         p_min_port = p_port;
2158                 } else if (p_port->counter_down < p_min_port->counter_down) {
2159                         /* this port is less loaded - use it as min */
2160                         p_min_port = p_port;
2161                 }
2162         }
2163
2164         /* At this point we have selected a group and port with the
2165            lowest load of downgoing routes.
2166            Set on the remote switch how to get to the target_lid -
2167            set LFT(target_lid) on the remote switch to the remote port */
2168         p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2169
2170         /* Four possible cases:
2171          *
2172          *  1. is_real_lid == TRUE && is_main_path == TRUE:
2173          *      - going UP(TRUE,TRUE) on selected min_group and min_port
2174          *         + promoting port counter
2175          *         + setting path in remote switch fwd tbl
2176          *         + setting hops in remote switch on all the ports of selected group
2177          *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2178          *         + NOT promoting port counter
2179          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2180          *         + setting hops in remote switch on all the ports of each group
2181          *           if it hasn't been set yet
2182          *
2183          *  2. is_real_lid == TRUE && is_main_path == FALSE:
2184          *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2185          *        but only if the remote (upper) switch hasn't been already
2186          *        configured for this target LID
2187          *         + NOT promoting port counter
2188          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2189          *         + setting hops in remote switch on all the ports of each group
2190          *           if it hasn't been set yet
2191          *
2192          *  3. is_real_lid == FALSE && is_main_path == TRUE:
2193          *      - going UP(FALSE,TRUE) ONLY on selected min_group and min_port
2194          *         + promoting port counter
2195          *         + NOT setting path in remote switch fwd tbl
2196          *         + NOT setting hops in remote switch
2197          *
2198          *  4. is_real_lid == FALSE && is_main_path == FALSE:
2199          *      - illegal state - we shouldn't get here
2200          */
2201
2202         /* covering first half of case 1, and case 3 */
2203         if (is_main_path) {
2204                 if (p_sw->is_leaf) {
2205                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2206                                 " - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2207                                 (is_real_lid) ? "real" : "DUMMY",
2208                                 cl_ntoh16(target_lid),
2209                                 __osm_ftree_tuple_to_str(p_sw->tuple),
2210                                 __osm_ftree_tuple_to_str(p_remote_sw->tuple));
2211                 }
2212                 /* The number of downgoing routes is tracked in the
2213                    p_group->counter_down p_port->counter_down counters of the
2214                    group and port that belong to the lower side of the link
2215                    (on switch with higher rank) */
2216                 p_min_group->counter_down++;
2217                 p_min_port->counter_down++;
2218                 if (is_real_lid) {
2219                         p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2220                                 p_min_port->remote_port_num;
2221                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2222                                 "Switch %s: set path to CA LID %u through port %u\n",
2223                                 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
2224                                 cl_ntoh16(target_lid),
2225                                 p_min_port->remote_port_num);
2226
2227                         /* On the remote switch that is pointed by the min_group,
2228                            set hops for ALL the ports in the remote group. */
2229
2230                         ports_num =
2231                             (uint16_t) cl_ptr_vector_get_size(&p_min_group->
2232                                                               ports);
2233                         for (j = 0; j < ports_num; j++) {
2234                                 cl_ptr_vector_at(&p_min_group->ports, j,
2235                                                  (void *)&p_port);
2236                                 __osm_ftree_sw_set_hops(p_remote_sw,
2237                                                         cl_ntoh16(target_lid),
2238                                                         p_port->remote_port_num,
2239                                                         target_rank -
2240                                                         p_remote_sw->rank);
2241                         }
2242                 }
2243
2244                 /* Recursion step:
2245                    Assign downgoing ports by stepping up, starting on REMOTE switch. */
2246                 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,    /* remote switch - used as a route-downgoing alg. next step point */
2247                                                                p_sw,    /* this switch - prev. position switch for the function */
2248                                                                target_lid,      /* LID that we're routing to */
2249                                                                target_rank,     /* rank of the LID that we're routing to */
2250                                                                is_real_lid,     /* whether this target LID is real or dummy */
2251                                                                is_main_path);   /* whether this is path to HCA that should by tracked by counters */
2252         }
2253
2254         /* we're done for the third case */
2255         if (!is_real_lid)
2256                 return;
2257
2258         /* What's left to do at this point:
2259          *
2260          *  1. is_real_lid == TRUE && is_main_path == TRUE:
2261          *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2262          *        but only if the remote (upper) switch hasn't been already
2263          *        configured for this target LID
2264          *         + NOT promoting port counter
2265          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2266          *         + setting hops in remote switch on all the ports of each group
2267          *           if it hasn't been set yet
2268          *
2269          *  2. is_real_lid == TRUE && is_main_path == FALSE:
2270          *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2271          *        but only if the remote (upper) switch hasn't been already
2272          *        configured for this target LID
2273          *         + NOT promoting port counter
2274          *         + setting path in remote switch fwd tbl if it hasn't been set yet
2275          *         + setting hops in remote switch on all the ports of each group
2276          *           if it hasn't been set yet
2277          *
2278          *  These two rules can be rephrased this way:
2279          *   - foreach UP port group
2280          *      + if remote switch has been set with the target LID
2281          *         - skip this port group
2282          *      + else
2283          *         - select port 0
2284          *         - do NOT promote port counter
2285          *         - set path in remote switch fwd tbl
2286          *         - set hops in remote switch on all the ports of this group
2287          *         - go UP(TRUE,FALSE) to the remote switch
2288          */
2289
2290         for (i = 0; i < p_sw->up_port_groups_num; i++) {
2291                 p_group = p_sw->up_port_groups[i];
2292                 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2293
2294                 /* skip if target lid has been already set on remote switch fwd tbl */
2295                 if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
2296                         continue;
2297
2298                 if (p_sw->is_leaf) {
2299                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2300                                 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2301                                 cl_ntoh16(target_lid),
2302                                 __osm_ftree_tuple_to_str(p_sw->tuple),
2303                                 __osm_ftree_tuple_to_str(p_remote_sw->tuple));
2304                 }
2305
2306                 /* Routing REAL lids on SECONDARY path means routing
2307                    switch-to-switch or switch-to-CA paths.
2308                    We can safely assume that switch will initiate very
2309                    few traffic, so there's no point waisting runtime on
2310                    trying to balance these routes - always pick port 0. */
2311
2312                 cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
2313                 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2314                         p_port->remote_port_num;
2315
2316                 /* On the remote switch that is pointed by the p_group,
2317                    set hops for ALL the ports in the remote group. */
2318
2319                 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2320                 for (j = 0; j < ports_num; j++) {
2321                         cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2322
2323                         __osm_ftree_sw_set_hops(p_remote_sw,
2324                                                 cl_ntoh16(target_lid),
2325                                                 p_port->remote_port_num,
2326                                                 target_rank -
2327                                                 p_remote_sw->rank);
2328                 }
2329
2330                 /* Recursion step:
2331                    Assign downgoing ports by stepping up, starting on REMOTE switch. */
2332                 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,    /* remote switch - used as a route-downgoing alg. next step point */
2333                                                                p_sw,    /* this switch - prev. position switch for the function */
2334                                                                target_lid,      /* LID that we're routing to */
2335                                                                target_rank,     /* rank of the LID that we're routing to */
2336                                                                TRUE,    /* whether the target LID is real or dummy */
2337                                                                FALSE);  /* whether this is path to HCA that should by tracked by counters */
2338         }
2339
2340 }                               /* ftree_fabric_route_downgoing_by_going_up() */
2341
2342 /***************************************************/
2343
2344 /*
2345  * Pseudo code:
2346  *    foreach leaf switch (in indexing order)
2347  *       for each compute node (in indexing order)
2348  *          obtain the LID of the compute node
2349  *          set local LFT(LID) of the port connecting to compute node
2350  *          call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2351  *       for each MISSING compute node
2352  *          call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
2353  */
2354
2355 static void __osm_ftree_fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
2356 {
2357         ftree_sw_t *p_sw;
2358         ftree_hca_t *p_hca;
2359         ftree_port_group_t *p_leaf_port_group;
2360         ftree_port_group_t *p_hca_port_group;
2361         ftree_port_t *p_port;
2362         uint32_t i;
2363         uint32_t j;
2364         ib_net16_t hca_lid;
2365         unsigned routed_targets_on_leaf;
2366
2367         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2368
2369         /* for each leaf switch (in indexing order) */
2370         for (i = 0; i < p_ftree->leaf_switches_num; i++) {
2371                 p_sw = p_ftree->leaf_switches[i];
2372                 routed_targets_on_leaf = 0;
2373
2374                 /* for each HCA connected to this switch */
2375                 for (j = 0; j < p_sw->down_port_groups_num; j++) {
2376                         p_leaf_port_group = p_sw->down_port_groups[j];
2377
2378                         /* work with this port group only if the remote node is CA */
2379                         if (p_leaf_port_group->remote_node_type !=
2380                             IB_NODE_TYPE_CA)
2381                                 continue;
2382
2383                         p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;
2384
2385                         /* work with this port group only if remote HCA has CNs */
2386                         if (!p_hca->cn_num)
2387                                 continue;
2388
2389                         p_hca_port_group =
2390                             __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
2391                                                                          p_leaf_port_group->
2392                                                                          base_lid);
2393                         CL_ASSERT(p_hca_port_group);
2394
2395                         /* work with this port group only if remote port is CN */
2396                         if (!p_hca_port_group->is_cn)
2397                                 continue;
2398
2399                         /* obtain the LID of HCA port */
2400                         hca_lid = p_leaf_port_group->remote_base_lid;
2401
2402                         /* set local LFT(LID) to the port that is connected to HCA */
2403                         cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
2404                                          (void *)&p_port);
2405                         p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = p_port->port_num;
2406
2407                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2408                                 "Switch %s: set path to CN LID %u through port %u\n",
2409                                 __osm_ftree_tuple_to_str(p_sw->tuple),
2410                                 cl_ntoh16(hca_lid), p_port->port_num);
2411
2412                         /* set local min hop table(LID) to route to the CA */
2413                         __osm_ftree_sw_set_hops(p_sw,
2414                                                 cl_ntoh16(hca_lid),
2415                                                 p_port->port_num, 1);
2416
2417                         /* Assign downgoing ports by stepping up.
2418                            Since we're routing here only CNs, we're routing it as REAL
2419                            LID and updating fat-tree balancing counters. */
2420                         __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,   /* local switch - used as a route-downgoing alg. start point */
2421                                                                        NULL,    /* prev. position switch */
2422                                                                        hca_lid, /* LID that we're routing to */
2423                                                                        p_sw->rank + 1,  /* rank of the LID that we're routing to */
2424                                                                        TRUE,    /* whether this HCA LID is real or dummy */
2425                                                                        TRUE);   /* whether this path to HCA should by tracked by counters */
2426
2427                         /* count how many real targets have been routed from this leaf switch */
2428                         routed_targets_on_leaf++;
2429                 }
2430
2431                 /* We're done with the real targets (all CNs) of this leaf switch.
2432                    Now route the dummy HCAs that are missing or that are non-CNs.
2433                    When routing to dummy HCAs we don't fill lid matrices. */
2434
2435                 if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
2436                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2437                                 "Routing %u dummy CAs\n",
2438                                 p_ftree->max_cn_per_leaf -
2439                                 p_sw->down_port_groups_num);
2440                         for (j = 0;
2441                              ((int)j) <
2442                              (p_ftree->max_cn_per_leaf -
2443                               routed_targets_on_leaf); j++) {
2444                                 /* assign downgoing ports by stepping up */
2445                                 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,   /* local switch - used as a route-downgoing alg. start point */
2446                                                                                NULL,    /* prev. position switch */
2447                                                                                0,       /* LID that we're routing to - ignored for dummy HCA */
2448                                                                                0,       /* rank of the LID that we're routing to - ignored for dummy HCA */
2449                                                                                FALSE,   /* whether this HCA LID is real or dummy */
2450                                                                                TRUE);   /* whether this path to HCA should by tracked by counters */
2451                         }
2452                 }
2453         }
2454         /* done going through all the leaf switches */
2455         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2456 }                               /* __osm_ftree_fabric_route_to_cns() */
2457
2458 /***************************************************/
2459
2460 /*
2461  * Pseudo code:
2462  *    foreach HCA non-CN port in fabric
2463  *       obtain the LID of the HCA port
2464  *       get switch that is connected to this HCA port
2465  *       set switch LFT(LID) to the port connecting to the HCA port
2466  *       call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2467  *
2468  * Routing to these HCAs is routing a REAL hca lid on SECONDARY path.
2469  * However, we do want to allow load-leveling of the traffic to the non-CNs,
2470  * because such nodes may include IO nodes with heavy usage
2471  *   - we should set fwd tables
2472  *   - we should update port counters
2473  * Routing to non-CNs is done after routing to CNs, so updated port
2474  * counters will not affect CN-to-CN routing.
2475  */
2476
2477 static void __osm_ftree_fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
2478 {
2479         ftree_sw_t *p_sw;
2480         ftree_hca_t *p_hca;
2481         ftree_hca_t *p_next_hca;
2482         ftree_port_t *p_hca_port;
2483         ftree_port_group_t *p_hca_port_group;
2484         ib_net16_t hca_lid;
2485         unsigned port_num_on_switch;
2486         unsigned i;
2487
2488         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2489
2490         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
2491         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
2492                 p_hca = p_next_hca;
2493                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
2494
2495                 for (i = 0; i < p_hca->up_port_groups_num; i++) {
2496                         p_hca_port_group = p_hca->up_port_groups[i];
2497
2498                         /* skip this port if it's CN, in which case it has been already routed */
2499                         if (p_hca_port_group->is_cn)
2500                                 continue;
2501
2502                         /* skip this port if it is not connected to switch */
2503                         if (p_hca_port_group->remote_node_type !=
2504                             IB_NODE_TYPE_SWITCH)
2505                                 continue;
2506
2507                         p_sw = p_hca_port_group->remote_hca_or_sw.p_sw;
2508                         hca_lid = p_hca_port_group->base_lid;
2509
2510                         /* set switches  LFT(LID) to the port that is connected to HCA */
2511                         cl_ptr_vector_at(&p_hca_port_group->ports, 0,
2512                                          (void *)&p_hca_port);
2513                         port_num_on_switch = p_hca_port->remote_port_num;
2514                         p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = port_num_on_switch;
2515
2516                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2517                                 "Switch %s: set path to non-CN HCA LID %u through port %u\n",
2518                                 __osm_ftree_tuple_to_str(p_sw->tuple),
2519                                 cl_ntoh16(hca_lid), port_num_on_switch);
2520
2521                         /* set local min hop table(LID) to route to the CA */
2522                         __osm_ftree_sw_set_hops(p_sw, cl_ntoh16(hca_lid),
2523                                                 port_num_on_switch,     /* port num */
2524                                                 1);     /* hops */
2525
2526                         /* Assign downgoing ports by stepping up.
2527                            We're routing REAL targets. They are not CNs and not included
2528                            in the leafs array, but we treat them as MAIN path to allow load
2529                            leveling, which means that the counters will be updated. */
2530                         __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,   /* local switch - used as a route-downgoing alg. start point */
2531                                                                        NULL,    /* prev. position switch */
2532                                                                        hca_lid, /* LID that we're routing to */
2533                                                                        p_sw->rank + 1,  /* rank of the LID that we're routing to */
2534                                                                        TRUE,    /* whether this HCA LID is real or dummy */
2535                                                                        TRUE);   /* whether this path to HCA should by tracked by counters */
2536                 }
2537                 /* done with all the port groups of this HCA - go to next HCA */
2538         }
2539
2540         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2541 }                               /* __osm_ftree_fabric_route_to_non_cns() */
2542
2543 /***************************************************/
2544
2545 /*
2546  * Pseudo code:
2547  *    foreach switch in fabric
2548  *       obtain its LID
2549  *       set local LFT(LID) to port 0
2550  *       call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
2551  *
2552  * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
2553  *   - we should set fwd tables
2554  *   - we should NOT update port counters
2555  */
2556
2557 static void __osm_ftree_fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
2558 {
2559         ftree_sw_t *p_sw;
2560         ftree_sw_t *p_next_sw;
2561
2562         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2563
2564         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
2565         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
2566                 p_sw = p_next_sw;
2567                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
2568
2569                 /* set local LFT(LID) to 0 (route to itself) */
2570                 p_sw->p_osm_sw->new_lft[cl_ntoh16(p_sw->base_lid)] = 0;
2571
2572                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2573                         "Switch %s (LID %u): routing switch-to-switch paths\n",
2574                         __osm_ftree_tuple_to_str(p_sw->tuple),
2575                         cl_ntoh16(p_sw->base_lid));
2576
2577                 /* set min hop table of the switch to itself */
2578                 __osm_ftree_sw_set_hops(p_sw, cl_ntoh16(p_sw->base_lid),
2579                                         0,      /* port_num */
2580                                         0);     /* hops     */
2581
2582                 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,   /* local switch - used as a route-downgoing alg. start point */
2583                                                                NULL,    /* prev. position switch */
2584                                                                p_sw->base_lid,  /* LID that we're routing to */
2585                                                                p_sw->rank,      /* rank of the LID that we're routing to */
2586                                                                TRUE,    /* whether the target LID is a real or dummy */
2587                                                                FALSE);  /* whether this path should by tracked by counters */
2588         }
2589
2590         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2591 }                               /* __osm_ftree_fabric_route_to_switches() */
2592
2593 /***************************************************
2594  ***************************************************/
2595
2596 static int __osm_ftree_fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
2597 {
2598         osm_node_t *p_osm_node;
2599         osm_node_t *p_next_osm_node;
2600
2601         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2602
2603         p_next_osm_node =
2604             (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl);
2605         while (p_next_osm_node !=
2606                (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->subn.
2607                                           node_guid_tbl)) {
2608                 p_osm_node = p_next_osm_node;
2609                 p_next_osm_node =
2610                     (osm_node_t *) cl_qmap_next(&p_osm_node->map_item);
2611                 switch (osm_node_get_type(p_osm_node)) {
2612                 case IB_NODE_TYPE_CA:
2613                         __osm_ftree_fabric_add_hca(p_ftree, p_osm_node);
2614                         break;
2615                 case IB_NODE_TYPE_ROUTER:
2616                         break;
2617                 case IB_NODE_TYPE_SWITCH:
2618                         __osm_ftree_fabric_add_sw(p_ftree, p_osm_node->sw);
2619                         break;
2620                 default:
2621                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0E: "
2622                                 "Node GUID 0x%016" PRIx64
2623                                 " - Unknown node type: %s\n",
2624                                 cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
2625                                 ib_get_node_type_str(osm_node_get_type
2626                                                      (p_osm_node)));
2627                         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2628                         return -1;
2629                 }
2630         }
2631
2632         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2633         return 0;
2634 }                               /* __osm_ftree_fabric_populate_nodes() */
2635
2636 /***************************************************
2637  ***************************************************/
2638
2639 static boolean_t __osm_ftree_sw_update_rank(IN ftree_sw_t * p_sw,
2640                                             IN uint32_t new_rank)
2641 {
2642         if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank <= new_rank)
2643                 return FALSE;
2644         p_sw->rank = new_rank;
2645         return TRUE;
2646
2647 }
2648
2649 /***************************************************/
2650
2651 static void
2652 __osm_ftree_rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
2653                                      IN cl_list_t * p_ranking_bfs_list)
2654 {
2655         ftree_sw_t *p_sw;
2656         ftree_sw_t *p_remote_sw;
2657         osm_node_t *p_node;
2658         osm_node_t *p_remote_node;
2659         osm_physp_t *p_osm_port;
2660         uint8_t i;
2661         unsigned max_rank = 0;
2662
2663         while (!cl_is_list_empty(p_ranking_bfs_list)) {
2664                 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
2665                 p_node = p_sw->p_osm_sw->p_node;
2666
2667                 /* note: skipping port 0 on switches */
2668                 for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
2669                         p_osm_port = osm_node_get_physp_ptr(p_node, i);
2670                         if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2671                                 continue;
2672
2673                         p_remote_node =
2674                             osm_node_get_remote_node(p_node, i, NULL);
2675                         if (!p_remote_node)
2676                                 continue;
2677                         if (osm_node_get_type(p_remote_node) !=
2678                             IB_NODE_TYPE_SWITCH)
2679                                 continue;
2680
2681                         p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2682                                                                         osm_node_get_node_guid
2683                                                                         (p_remote_node));
2684                         if (!p_remote_sw) {
2685                                 /* remote node is not a switch */
2686                                 continue;
2687                         }
2688
2689                         /* if needed, rank the remote switch and add it to the BFS list */
2690                         if (__osm_ftree_sw_update_rank
2691                             (p_remote_sw, p_sw->rank + 1)) {
2692                                 max_rank = p_remote_sw->rank;
2693                                 cl_list_insert_tail(p_ranking_bfs_list,
2694                                                     p_remote_sw);
2695                         }
2696                 }
2697         }
2698
2699         /* set FatTree maximal switch rank */
2700         p_ftree->max_switch_rank = max_rank;
2701
2702 }                               /* __osm_ftree_rank_switches_from_leafs() */
2703
2704 /***************************************************/
2705
2706 static int
2707 __osm_ftree_rank_leaf_switches(IN ftree_fabric_t * p_ftree,
2708                                IN ftree_hca_t * p_hca,
2709                                IN cl_list_t * p_ranking_bfs_list)
2710 {
2711         ftree_sw_t *p_sw;
2712         osm_node_t *p_osm_node = p_hca->p_osm_node;
2713         osm_node_t *p_remote_osm_node;
2714         osm_physp_t *p_osm_port;
2715         static uint8_t i = 0;
2716         int res = 0;
2717
2718         OSM_LOG_ENTER(&p_ftree->p_osm->log);
2719
2720         for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
2721                 p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
2722                 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2723                         continue;
2724
2725                 p_remote_osm_node =
2726                     osm_node_get_remote_node(p_osm_node, i, NULL);
2727                 if (!p_remote_osm_node)
2728                         continue;
2729
2730                 switch (osm_node_get_type(p_remote_osm_node)) {
2731                 case IB_NODE_TYPE_CA:
2732                         /* HCA connected directly to another HCA - not FatTree */
2733                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0F: "
2734                                 "CA conected directly to another CA: "
2735                                 "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
2736                                 __osm_ftree_hca_get_guid_ho(p_hca),
2737                                 cl_ntoh64(osm_node_get_node_guid
2738                                           (p_remote_osm_node)));
2739                         res = -1;
2740                         goto Exit;
2741
2742                 case IB_NODE_TYPE_ROUTER:
2743                         /* leaving this port - proceeding to the next one */
2744                         continue;
2745
2746                 case IB_NODE_TYPE_SWITCH:
2747                         /* continue with this port */
2748                         break;
2749
2750                 default:
2751                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2752                                 "ERR AB10: Node GUID 0x%016" PRIx64
2753                                 " - Unknown node type: %s\n",
2754                                 cl_ntoh64(osm_node_get_node_guid
2755                                           (p_remote_osm_node)),
2756                                 ib_get_node_type_str(osm_node_get_type
2757                                                      (p_remote_osm_node)));
2758                         res = -1;
2759                         goto Exit;
2760                 }
2761
2762                 /* remote node is switch */
2763
2764                 p_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2765                                                          osm_node_get_node_guid
2766                                                          (p_osm_port->
2767                                                           p_remote_physp->
2768                                                           p_node));
2769                 CL_ASSERT(p_sw);
2770
2771                 /* if needed, rank the remote switch and add it to the BFS list */
2772
2773                 if (!__osm_ftree_sw_update_rank(p_sw, 0))
2774                         continue;
2775                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2776                         "Marking rank of switch that is directly connected to CA:\n"
2777                         "                                            - CA guid    : 0x%016"
2778                         PRIx64 "\n"
2779                         "                                            - Switch guid: 0x%016"
2780                         PRIx64 "\n"
2781                         "                                            - Switch LID : %u\n",
2782                         __osm_ftree_hca_get_guid_ho(p_hca),
2783                         __osm_ftree_sw_get_guid_ho(p_sw),
2784                         cl_ntoh16(p_sw->base_lid));
2785                 cl_list_insert_tail(p_ranking_bfs_list, p_sw);
2786         }
2787
2788 Exit:
2789         OSM_LOG_EXIT(&p_ftree->p_osm->log);
2790         return res;
2791 }                               /* __osm_ftree_rank_leaf_switches() */
2792
2793 /***************************************************/
2794
2795 static void __osm_ftree_sw_reverse_rank(IN cl_map_item_t * const p_map_item,
2796                                         IN void *context)
2797 {
2798         ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
2799         ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
2800         p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
2801 }
2802
2803 /***************************************************
2804  ***************************************************/
2805
2806 static int
2807 __osm_ftree_fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree,
2808                                        IN ftree_hca_t * p_hca)
2809 {
2810         ftree_sw_t *p_remote_sw;
2811         osm_node_t *p_node = p_hca->p_osm_node;
2812         osm_node_t *p_remote_node;
2813         uint8_t remote_node_type;
2814         ib_net64_t remote_node_guid;
2815         osm_physp_t *p_remote_osm_port;
2816         uint8_t i;
2817         uint8_t remote_port_num;
2818         boolean_t is_cn = FALSE;
2819         int res = 0;
2820
2821         for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
2822                 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
2823                 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2824                         continue;
2825
2826                 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
2827                 p_remote_node =
2828                     osm_node_get_remote_node(p_node, i, &remote_port_num);
2829
2830                 if (!p_remote_osm_port)
2831                         continue;
2832
2833                 remote_node_type = osm_node_get_type(p_remote_node);
2834                 remote_node_guid = osm_node_get_node_guid(p_remote_node);
2835
2836                 switch (remote_node_type) {
2837                 case IB_NODE_TYPE_ROUTER:
2838                         /* leaving this port - proceeding to the next one */
2839                         continue;
2840
2841                 case IB_NODE_TYPE_CA:
2842                         /* HCA connected directly to another HCA - not FatTree */
2843                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB11: "
2844                                 "CA conected directly to another CA: "
2845                                 "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
2846                                 cl_ntoh64(osm_node_get_node_guid(p_node)),
2847                                 cl_ntoh64(remote_node_guid));
2848                         res = -1;
2849                         goto Exit;
2850
2851                 case IB_NODE_TYPE_SWITCH:
2852                         /* continue with this port */
2853                         break;
2854
2855                 default:
2856                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2857                                 "ERR AB12: Node GUID 0x%016" PRIx64
2858                                 " - Unknown node type: %s\n",
2859                                 cl_ntoh64(remote_node_guid),
2860                                 ib_get_node_type_str(remote_node_type));
2861                         res = -1;
2862                         goto Exit;
2863                 }
2864
2865                 /* remote node is switch */
2866
2867                 p_remote_sw =
2868                     __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2869                                                       remote_node_guid);
2870                 CL_ASSERT(p_remote_sw);
2871
2872                 /* If CN file is not supplied, then all the CAs considered as Compute Nodes.
2873                    Otherwise all the CAs are not CNs, and only guids that are present in the
2874                    CN file will be marked as compute nodes. */
2875                 if (!__osm_ftree_fabric_cns_provided(p_ftree)) {
2876                         is_cn = TRUE;
2877                 } else {
2878                         name_map_item_t *p_elem =
2879                             (name_map_item_t *) cl_qmap_get(&p_ftree->
2880                                                             cn_guid_tbl,
2881                                                             cl_ntoh64(osm_physp_get_port_guid
2882                                                             (p_osm_port)));
2883                         if (p_elem !=
2884                             (name_map_item_t *) cl_qmap_end(&p_ftree->
2885                                                             cn_guid_tbl))
2886                                 is_cn = TRUE;
2887                 }
2888
2889                 if (is_cn) {
2890                         p_ftree->cn_num++;
2891                         p_hca->cn_num++;
2892                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2893                                 "Marking CN port GUID 0x%016" PRIx64 "\n",
2894                                 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
2895                 } else {
2896                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2897                                 "Marking non-CN port GUID 0x%016" PRIx64 "\n",
2898                                 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
2899                 }
2900
2901                 __osm_ftree_hca_add_port(p_hca, /* local ftree_hca object */
2902                                          i,     /* local port number */
2903                                          remote_port_num,       /* remote port number */
2904                                          osm_node_get_base_lid(p_node, i),      /* local lid */
2905                                          osm_node_get_base_lid(p_remote_node, 0),       /* remote lid */
2906                                          osm_physp_get_port_guid(p_osm_port),   /* local port guid */
2907                                          osm_physp_get_port_guid(p_remote_osm_port),    /* remote port guid */
2908                                          remote_node_guid,      /* remote node guid */
2909                                          remote_node_type,      /* remote node type */
2910                                          (void *)p_remote_sw,   /* remote ftree_hca/sw object */
2911                                          is_cn);        /* whether this port is compute node */
2912         }
2913
2914 Exit:
2915         return res;
2916 }                               /* __osm_ftree_fabric_construct_hca_ports() */
2917
2918 /***************************************************
2919  ***************************************************/
2920
2921 static int __osm_ftree_fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
2922                                                  IN ftree_sw_t * p_sw)
2923 {
2924         ftree_hca_t *p_remote_hca;
2925         ftree_sw_t *p_remote_sw;
2926         osm_node_t *p_node = p_sw->p_osm_sw->p_node;
2927         osm_node_t *p_remote_node;
2928         ib_net16_t remote_base_lid;
2929         uint8_t remote_node_type;
2930         ib_net64_t remote_node_guid;
2931         osm_physp_t *p_remote_osm_port;
2932         ftree_direction_t direction;
2933         void *p_remote_hca_or_sw;
2934         uint8_t i;
2935         uint8_t remote_port_num;
2936         int res = 0;
2937
2938         CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
2939
2940         for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
2941                 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
2942                 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2943                         continue;
2944
2945                 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
2946                 if (!p_remote_osm_port)
2947                         continue;
2948
2949                 p_remote_node =
2950                     osm_node_get_remote_node(p_node, i, &remote_port_num);
2951
2952                 /* ignore any loopback connection on switch */
2953                 if (p_node == p_remote_node) {
2954                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2955                                 "Ignoring loopback on switch GUID 0x%016" PRIx64
2956                                 ", LID %u, rank %u\n",
2957                                 __osm_ftree_sw_get_guid_ho(p_sw),
2958                                 cl_ntoh16(p_sw->base_lid),
2959                                 p_sw->rank);
2960                         continue;
2961                 }
2962
2963                 remote_node_type = osm_node_get_type(p_remote_node);
2964                 remote_node_guid = osm_node_get_node_guid(p_remote_node);
2965
2966                 switch (remote_node_type) {
2967                 case IB_NODE_TYPE_ROUTER:
2968                         /* leaving this port - proceeding to the next one */
2969                         continue;
2970
2971                 case IB_NODE_TYPE_CA:
2972                         /* switch connected to hca */
2973
2974                         p_remote_hca =
2975                             __osm_ftree_fabric_get_hca_by_guid(p_ftree,
2976                                                                remote_node_guid);
2977                         CL_ASSERT(p_remote_hca);
2978
2979                         p_remote_hca_or_sw = (void *)p_remote_hca;
2980                         direction = FTREE_DIRECTION_DOWN;
2981
2982                         remote_base_lid =
2983                             osm_physp_get_base_lid(p_remote_osm_port);
2984                         break;
2985
2986                 case IB_NODE_TYPE_SWITCH:
2987                         /* switch connected to another switch */
2988
2989                         p_remote_sw =
2990                             __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2991                                                               remote_node_guid);
2992                         CL_ASSERT(p_remote_sw);
2993
2994                         p_remote_hca_or_sw = (void *)p_remote_sw;
2995
2996                         if (abs(p_sw->rank - p_remote_sw->rank) != 1) {
2997                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2998                                         "ERR AB16: "
2999                                         "Illegal link between switches with ranks %u and %u:\n"
3000                                         "       GUID 0x%016" PRIx64
3001                                         ", LID %u, rank %u\n"
3002                                         "       GUID 0x%016" PRIx64
3003                                         ", LID %u, rank %u\n", p_sw->rank,
3004                                         p_remote_sw->rank,
3005                                         __osm_ftree_sw_get_guid_ho(p_sw),
3006                                         cl_ntoh16(p_sw->base_lid), p_sw->rank,
3007                                         __osm_ftree_sw_get_guid_ho(p_remote_sw),
3008                                         cl_ntoh16(p_remote_sw->base_lid),
3009                                         p_remote_sw->rank);
3010                                 res = -1;
3011                                 goto Exit;
3012                         }
3013
3014                         if (p_sw->rank > p_remote_sw->rank)
3015                                 direction = FTREE_DIRECTION_UP;
3016                         else
3017                                 direction = FTREE_DIRECTION_DOWN;
3018
3019                         /* switch LID is only in port 0 port_info structure */
3020                         remote_base_lid =
3021                             osm_node_get_base_lid(p_remote_node, 0);
3022
3023                         break;
3024
3025                 default:
3026                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3027                                 "ERR AB13: Node GUID 0x%016" PRIx64
3028                                 " - Unknown node type: %s\n",
3029                                 cl_ntoh64(remote_node_guid),
3030                                 ib_get_node_type_str(remote_node_type));
3031                         res = -1;
3032                         goto Exit;
3033                 }
3034                 __osm_ftree_sw_add_port(p_sw,   /* local ftree_sw object */
3035                                         i,      /* local port number */
3036                                         remote_port_num,        /* remote port number */
3037                                         p_sw->base_lid, /* local lid */
3038                                         remote_base_lid,        /* remote lid */
3039                                         osm_physp_get_port_guid(p_osm_port),    /* local port guid */
3040                                         osm_physp_get_port_guid(p_remote_osm_port),     /* remote port guid */
3041                                         remote_node_guid,       /* remote node guid */
3042                                         remote_node_type,       /* remote node type */
3043                                         p_remote_hca_or_sw,     /* remote ftree_hca/sw object */
3044                                         direction);     /* port direction (up or down) */
3045
3046                 /* Track the max lid (in host order) that exists in the fabric */
3047                 if (cl_ntoh16(remote_base_lid) > p_ftree->lft_max_lid_ho)
3048                         p_ftree->lft_max_lid_ho = cl_ntoh16(remote_base_lid);
3049         }
3050
3051 Exit:
3052         return res;
3053 }                               /* __osm_ftree_fabric_construct_sw_ports() */
3054
3055 /***************************************************
3056  ***************************************************/
3057
3058 static int __osm_ftree_fabric_rank_from_roots(IN ftree_fabric_t * p_ftree)
3059 {
3060         osm_node_t *p_osm_node;
3061         osm_node_t *p_remote_osm_node;
3062         osm_physp_t *p_osm_physp;
3063         ftree_sw_t *p_sw;
3064         ftree_sw_t *p_remote_sw;
3065         cl_list_t ranking_bfs_list;
3066         struct guid_list_item *item;
3067         int res = 0;
3068         unsigned num_roots;
3069         unsigned max_rank = 0;
3070         unsigned i;
3071
3072         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3073         cl_list_init(&ranking_bfs_list, 10);
3074
3075         /* Rank all the roots and add them to list */
3076         for (item = (void *)cl_qlist_head(&p_ftree->root_guid_list);
3077              item != (void *)cl_qlist_end(&p_ftree->root_guid_list);
3078              item = (void *)cl_qlist_next(&item->list)) {
3079                 p_sw =
3080                     __osm_ftree_fabric_get_sw_by_guid(p_ftree,
3081                                                       cl_hton64(item->guid));
3082                 if (!p_sw) {
3083                         /* the specified root guid wasn't found in the fabric */
3084                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3085                                 "Root switch GUID 0x%" PRIx64 " not found\n",
3086                                 item->guid);
3087                         continue;
3088                 }
3089
3090                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3091                         "Ranking root switch with GUID 0x%" PRIx64 "\n",
3092                         item->guid);
3093                 p_sw->rank = 0;
3094                 cl_list_insert_tail(&ranking_bfs_list, p_sw);
3095         }
3096
3097         num_roots = cl_list_count(&ranking_bfs_list);
3098         if (!num_roots) {
3099                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3100                         "No valid roots supplied\n");
3101                 res = -1;
3102                 goto Exit;
3103         }
3104
3105         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3106                 "Ranked %u valid root switches\n", num_roots);
3107
3108         /* Now the list has all the roots.
3109            BFS the subnet and update rank on all the switches. */
3110
3111         while (!cl_is_list_empty(&ranking_bfs_list)) {
3112                 p_sw = (ftree_sw_t *) cl_list_remove_head(&ranking_bfs_list);
3113                 p_osm_node = p_sw->p_osm_sw->p_node;
3114
3115                 /* note: skipping port 0 on switches */
3116                 for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) {
3117                         p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i);
3118                         if (!p_osm_physp  || !osm_link_is_healthy(p_osm_physp))
3119                                 continue;
3120
3121                         p_remote_osm_node =
3122                             osm_node_get_remote_node(p_osm_node, i, NULL);
3123                         if (!p_remote_osm_node)
3124                                 continue;
3125
3126                         if (osm_node_get_type(p_remote_osm_node) !=
3127                             IB_NODE_TYPE_SWITCH)
3128                                 continue;
3129
3130                         p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
3131                                                                         osm_node_get_node_guid
3132                                                                         (p_remote_osm_node));
3133                         CL_ASSERT(p_remote_sw);
3134
3135                         /* if needed, rank the remote switch and add it to the BFS list */
3136                         if (__osm_ftree_sw_update_rank
3137                             (p_remote_sw, p_sw->rank + 1)) {
3138                                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3139                                         "Ranking switch 0x%" PRIx64
3140                                         " with rank %u\n",
3141                                         __osm_ftree_sw_get_guid_ho(p_remote_sw),
3142                                         p_remote_sw->rank);
3143                                 max_rank = p_remote_sw->rank;
3144                                 cl_list_insert_tail(&ranking_bfs_list,
3145                                                     p_remote_sw);
3146                         }
3147                 }
3148                 /* done with ports of this switch - go to the next switch in the list */
3149         }
3150
3151         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3152                 "Subnet ranking completed. Max Node Rank = %u\n", max_rank);
3153
3154         /* set FatTree maximal switch rank */
3155         p_ftree->max_switch_rank = max_rank;
3156
3157 Exit:
3158         cl_list_destroy(&ranking_bfs_list);
3159         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3160         return res;
3161 }                               /* __osm_ftree_fabric_rank_from_roots() */
3162
3163 /***************************************************
3164  ***************************************************/
3165
3166 static int __osm_ftree_fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
3167 {
3168         ftree_hca_t *p_hca;
3169         ftree_hca_t *p_next_hca;
3170         cl_list_t ranking_bfs_list;
3171         int res = 0;
3172
3173         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3174
3175         cl_list_init(&ranking_bfs_list, 10);
3176
3177         /* Mark REVERSED rank of all the switches in the subnet.
3178            Start from switches that are connected to hca's, and
3179            scan all the switches in the subnet. */
3180         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3181         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3182                 p_hca = p_next_hca;
3183                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3184                 if (__osm_ftree_rank_leaf_switches
3185                     (p_ftree, p_hca, &ranking_bfs_list) != 0) {
3186                         res = -1;
3187                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB14: "
3188                                 "Subnet ranking failed - subnet is not FatTree");
3189                         goto Exit;
3190                 }
3191         }
3192
3193         /* Now rank rest of the switches in the fabric, while the
3194            list already contains all the ranked leaf switches */
3195         __osm_ftree_rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
3196
3197         /* fix ranking of the switches by reversing the ranking direction */
3198         cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_sw_reverse_rank,
3199                            (void *)p_ftree);
3200
3201 Exit:
3202         cl_list_destroy(&ranking_bfs_list);
3203         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3204         return res;
3205 }                               /* __osm_ftree_fabric_rank_from_hcas() */
3206
3207 /***************************************************
3208  ***************************************************/
3209
3210 static int __osm_ftree_fabric_rank(IN ftree_fabric_t * p_ftree)
3211 {
3212         int res = 0;
3213
3214         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3215
3216         if (__osm_ftree_fabric_roots_provided(p_ftree))
3217                 res = __osm_ftree_fabric_rank_from_roots(p_ftree);
3218         else
3219                 res = __osm_ftree_fabric_rank_from_hcas(p_ftree);
3220
3221         if (res)
3222                 goto Exit;
3223
3224         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
3225                 "FatTree max switch rank is %u\n", p_ftree->max_switch_rank);
3226
3227 Exit:
3228         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3229         return res;
3230 }                               /* __osm_ftree_fabric_rank() */
3231
3232 /***************************************************
3233  ***************************************************/
3234
3235 static void __osm_ftree_fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
3236 {
3237         unsigned i;
3238         ftree_sw_t *p_sw;
3239         ftree_hca_t *p_hca = NULL;
3240         ftree_hca_t *p_next_hca;
3241
3242         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3243
3244         if (!__osm_ftree_fabric_roots_provided(p_ftree)) {
3245                 /* If root file is not provided, the fabric has to be pure fat-tree
3246                    in terms of ranking. Thus, leaf switches rank is the max rank. */
3247                 p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
3248         } else {
3249                 /* Find the first CN and set the leaf_switch_rank to the rank
3250                    of the switch that is connected to this CN. Later we will
3251                    ensure that all the leaf switches have the same rank. */
3252                 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3253                 while (p_next_hca !=
3254                        (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3255                         p_hca = p_next_hca;
3256                         if (p_hca->cn_num)
3257                                 break;
3258                         p_next_hca =
3259                             (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3260                 }
3261                 /* we know that there are CNs in the fabric, so just to be sure... */
3262                 CL_ASSERT(p_next_hca !=
3263                           (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));
3264
3265                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3266                         "Selected CN port GUID 0x%" PRIx64 "\n",
3267                         __osm_ftree_hca_get_guid_ho(p_hca));
3268
3269                 for (i = 0; (i < p_hca->up_port_groups_num)
3270                      && (!p_hca->up_port_groups[i]->is_cn); i++) ;
3271                 CL_ASSERT(i < p_hca->up_port_groups_num);
3272                 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
3273                           IB_NODE_TYPE_SWITCH);
3274
3275                 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
3276                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3277                         "Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
3278                         __osm_ftree_sw_get_guid_ho(p_sw), p_sw->rank);
3279                 p_ftree->leaf_switch_rank = p_sw->rank;
3280         }
3281
3282         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
3283                 "FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
3284         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3285 }                               /* __osm_ftree_fabric_set_leaf_rank() */
3286
3287 /***************************************************
3288  ***************************************************/
3289
3290 static int __osm_ftree_fabric_populate_ports(IN ftree_fabric_t * p_ftree)
3291 {
3292         ftree_hca_t *p_hca;
3293         ftree_hca_t *p_next_hca;
3294         ftree_sw_t *p_sw;
3295         ftree_sw_t *p_next_sw;
3296         int res = 0;
3297
3298         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3299
3300         p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3301         while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3302                 p_hca = p_next_hca;
3303                 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3304                 if (__osm_ftree_fabric_construct_hca_ports(p_ftree, p_hca) != 0) {
3305                         res = -1;
3306                         goto Exit;
3307                 }
3308         }
3309
3310         p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3311         while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3312                 p_sw = p_next_sw;
3313                 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3314                 if (__osm_ftree_fabric_construct_sw_ports(p_ftree, p_sw) != 0) {
3315                         res = -1;
3316                         goto Exit;
3317                 }
3318         }
3319 Exit:
3320         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3321         return res;
3322 }                               /* __osm_ftree_fabric_populate_ports() */
3323
3324 /***************************************************
3325  ***************************************************/
3326 static int add_guid_item_to_list(void *cxt, uint64_t guid, char *p)
3327 {
3328         cl_qlist_t *list = cxt;
3329         struct guid_list_item *item;
3330
3331         item = malloc(sizeof(*item));
3332         if (!item)
3333                 return -1;
3334
3335         item->guid = guid;
3336         cl_qlist_insert_tail(list, &item->list);
3337
3338         return 0;
3339 }
3340
3341 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3342 {
3343         cl_qmap_t *map = cxt;
3344         name_map_item_t *item;
3345
3346         item = malloc(sizeof(*item));
3347         if (!item)
3348                 return -1;
3349
3350         item->guid = guid;
3351         cl_qmap_insert(map, guid, &item->item);
3352
3353         return 0;
3354 }
3355
3356 static int __osm_ftree_fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
3357 {
3358         int status = 0;
3359
3360         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3361
3362         if (__osm_ftree_fabric_roots_provided(p_ftree)) {
3363                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3364                         "Fetching root nodes from file %s\n",
3365                         p_ftree->p_osm->subn.opt.root_guid_file);
3366
3367                 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3368                                    add_guid_item_to_list,
3369                                    &p_ftree->root_guid_list)) {
3370                         status = -1;
3371                         goto Exit;
3372                 }
3373
3374                 if (!cl_qlist_count(&p_ftree->root_guid_list)) {
3375                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB22: "
3376                                 "Root guids file has no valid guids\n");
3377                         status = -1;
3378                         goto Exit;
3379                 }
3380         }
3381
3382         if (__osm_ftree_fabric_cns_provided(p_ftree)) {
3383                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3384                         "Fetching compute nodes from file %s\n",
3385                         p_ftree->p_osm->subn.opt.cn_guid_file);
3386
3387                 if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
3388                                    add_guid_item_to_map,
3389                                    &p_ftree->cn_guid_tbl)) {
3390                         status = -1;
3391                         goto Exit;
3392                 }
3393
3394                 if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) {
3395                         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB23: "
3396                                 "Compute node guids file has no valid guids\n");
3397                         status = -1;
3398                         goto Exit;
3399                 }
3400         }
3401
3402 Exit:
3403         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3404         return status;
3405 } /*__osm_ftree_fabric_read_guid_files() */
3406
3407 /***************************************************
3408  ***************************************************/
3409
3410 static int __osm_ftree_construct_fabric(IN void *context)
3411 {
3412         ftree_fabric_t *p_ftree = context;
3413         int status = 0;
3414
3415         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3416
3417         __osm_ftree_fabric_clear(p_ftree);
3418
3419         if (p_ftree->p_osm->subn.opt.lmc > 0) {
3420                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3421                         "LMC > 0 is not supported by fat-tree routing.\n"
3422                         "Falling back to default routing\n");
3423                 status = -1;
3424                 goto Exit;
3425         }
3426
3427         if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
3428                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3429                         "Fabric has %u switches - topology is not fat-tree.\n"
3430                         "Falling back to default routing\n",
3431                         cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
3432                 status = -1;
3433                 goto Exit;
3434         }
3435
3436         if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
3437              cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
3438                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3439                         "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
3440                         "Falling back to default routing\n",
3441                         cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
3442                         cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
3443                 status = -1;
3444                 goto Exit;
3445         }
3446
3447         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
3448                 "                       |----------------------------------------|\n"
3449                 "                       |- Starting FatTree fabric construction -|\n"
3450                 "                       |----------------------------------------|\n\n");
3451
3452         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3453                 "Populating FatTree Switch and CA tables\n");
3454         if (__osm_ftree_fabric_populate_nodes(p_ftree) != 0) {
3455                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3456                         "Fabric topology is not fat-tree - "
3457                         "falling back to default routing\n");
3458                 status = -1;
3459                 goto Exit;
3460         }
3461
3462         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3463                 "Reading guid files provided by user\n");
3464         if (__osm_ftree_fabric_read_guid_files(p_ftree) != 0) {
3465                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3466                         "Failed reading guid files - "
3467                         "falling back to default routing\n");
3468                 status = -1;
3469                 goto Exit;
3470         }
3471
3472         if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
3473                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3474                         "Fabric has %u CAa - topology is not fat-tree.\n"
3475                         "Falling back to default routing\n",
3476                         cl_qmap_count(&p_ftree->hca_tbl));
3477                 status = -1;
3478                 goto Exit;
3479         }
3480
3481         /* Rank all the switches in the fabric.
3482            After that we will know only fabric max switch rank.
3483            We will be able to check leaf switches rank and the
3484            whole tree rank after filling ports and marking CNs. */
3485         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
3486         if (__osm_ftree_fabric_rank(p_ftree) != 0) {
3487                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3488                         "Failed ranking the tree\n");
3489                 status = -1;
3490                 goto Exit;
3491         }
3492
3493         /* For each hca and switch, construct array of ports.
3494            This is done after the whole FatTree data structure is ready,
3495            because we want the ports to have pointers to ftree_{sw,hca}_t
3496            objects, and we need the switches to be already ranked because
3497            that's how the port direction is determined. */
3498         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3499                 "Populating CA & switch ports\n");
3500         if (__osm_ftree_fabric_populate_ports(p_ftree) != 0) {
3501                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3502                         "Fabric topology is not a fat-tree\n");
3503                 status = -1;
3504                 goto Exit;
3505         } else if (p_ftree->cn_num == 0) {
3506                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3507                         "Fabric has no valid compute nodes\n");
3508                 status = -1;
3509                 goto Exit;
3510         }
3511
3512         /* Now that the CA ports have been created and CNs were marked,
3513            we can complete the fabric ranking - set leaf switches rank. */
3514         __osm_ftree_fabric_set_leaf_rank(p_ftree);
3515
3516         if (__osm_ftree_fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
3517             __osm_ftree_fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
3518                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3519                         "Fabric rank is %u (should be between %u and %u)\n",
3520                         __osm_ftree_fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
3521                         FAT_TREE_MAX_RANK);
3522                 status = -1;
3523                 goto Exit;
3524         }
3525
3526         /* Mark all the switches in the fabric with rank equal to
3527            p_ftree->leaf_switch_rank and that are also connected to CNs.
3528            As a by-product, this function also runs basic topology
3529            validation - it checks that all the CNs are at the same rank. */
3530         if (__osm_ftree_fabric_mark_leaf_switches(p_ftree)) {
3531                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3532                         "Fabric topology is not a fat-tree\n");
3533                 status = -1;
3534                 goto Exit;
3535         }
3536
3537         /* Assign index to all the switches in the fabric.
3538            This function also sorts leaf switch array by the switch index,
3539            sorts all the port arrays of the indexed switches by remote
3540            switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */
3541         __osm_ftree_fabric_make_indexing(p_ftree);
3542
3543         /* Create leaf switch array sorted by index.
3544            This array contains switches with rank equal to p_ftree->leaf_switch_rank
3545            and that are also connected to CNs (REAL leafs), and it may contain
3546            switches at the same leaf rank w/o CNs, if this is the order of indexing.
3547            In any case, the first and the last switches in the array are REAL leafs. */
3548         if (__osm_ftree_fabric_create_leaf_switch_array(p_ftree)) {
3549                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3550                         "Fabric topology is not a fat-tree\n");
3551                 status = -1;
3552                 goto Exit;
3553         }
3554
3555         /* calculate and set ftree.max_cn_per_leaf field */
3556         __osm_ftree_fabric_set_max_cn_per_leaf(p_ftree);
3557
3558         /* print general info about fabric topology */
3559         __osm_ftree_fabric_dump_general_info(p_ftree);
3560
3561         /* dump full tree topology */
3562         if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
3563                 __osm_ftree_fabric_dump(p_ftree);
3564
3565         /* the fabric is required to be PURE fat-tree only if the root
3566            guid file hasn't been provided by user */
3567         if (!__osm_ftree_fabric_roots_provided(p_ftree) &&
3568             !__osm_ftree_fabric_validate_topology(p_ftree)) {
3569                 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3570                         "Fabric topology is not a fat-tree\n");
3571                 status = -1;
3572                 goto Exit;
3573         }
3574
3575         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3576                 "Max LID in switch LFTs: %u\n",
3577                 p_ftree->lft_max_lid_ho);
3578
3579 Exit:
3580         if (status != 0) {
3581                 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3582                         "Clearing FatTree Fabric data structures\n");
3583                 __osm_ftree_fabric_clear(p_ftree);
3584         } else
3585                 p_ftree->fabric_built = TRUE;
3586
3587         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
3588                 "                       |--------------------------------------------------|\n"
3589                 "                       |- Done constructing FatTree fabric (status = %d) -|\n"
3590                 "                       |--------------------------------------------------|\n\n",
3591                 status);
3592
3593         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3594         return status;
3595 }                               /* __osm_ftree_construct_fabric() */
3596
3597 /***************************************************
3598  ***************************************************/
3599
3600 static int __osm_ftree_do_routing(IN void *context)
3601 {
3602         ftree_fabric_t *p_ftree = context;
3603         int status = 0;
3604
3605         OSM_LOG_ENTER(&p_ftree->p_osm->log);
3606
3607         if (!p_ftree->fabric_built) {
3608                 status = -1;
3609                 goto Exit;
3610         }
3611
3612         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3613                 "Starting FatTree routing\n");
3614
3615         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3616                 "Filling switch forwarding tables for Compute Nodes\n");
3617         __osm_ftree_fabric_route_to_cns(p_ftree);
3618
3619         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3620                 "Filling switch forwarding tables for non-CN targets\n");
3621         __osm_ftree_fabric_route_to_non_cns(p_ftree);
3622
3623         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3624                 "Filling switch forwarding tables for switch-to-switch paths\n");
3625         __osm_ftree_fabric_route_to_switches(p_ftree);
3626
3627         /* for each switch, set its fwd table */
3628         cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_set_sw_fwd_table,
3629                            (void *)p_ftree);
3630
3631         /* write out hca ordering file */
3632         __osm_ftree_fabric_dump_hca_ordering(p_ftree);
3633
3634         OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3635                 "FatTree routing is done\n");
3636
3637 Exit:
3638         OSM_LOG_EXIT(&p_ftree->p_osm->log);
3639         return status;
3640 }
3641
3642 /***************************************************
3643  ***************************************************/
3644
3645 static void __osm_ftree_delete(IN void *context)
3646 {
3647         if (!context)
3648                 return;
3649         __osm_ftree_fabric_destroy((ftree_fabric_t *) context);
3650 }
3651
3652 /***************************************************
3653  ***************************************************/
3654
3655 int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
3656 {
3657         ftree_fabric_t *p_ftree = __osm_ftree_fabric_create();
3658         if (!p_ftree)
3659                 return -1;
3660
3661         p_ftree->p_osm = p_osm;
3662
3663         r->context = (void *)p_ftree;
3664         r->build_lid_matrices = __osm_ftree_construct_fabric;
3665         r->ucast_build_fwd_tables = __osm_ftree_do_routing;
3666         r->delete = __osm_ftree_delete;
3667
3668         return 0;
3669 }