2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38 * Implementation of OpenSM FatTree routing
49 #include <iba/ib_types.h>
50 #include <complib/cl_qmap.h>
51 #include <complib/cl_debug.h>
52 #include <opensm/osm_opensm.h>
53 #include <opensm/osm_switch.h>
56 * FatTree rank is bounded between 2 and 8:
57 * - Tree of rank 1 has only trivial routing paths,
58 * so no need to use FatTree routing.
59 * - Why maximum rank is 8:
60 * Each node (switch) is assigned a unique tuple.
61 * Switches are stored in two cl_qmaps - one is
62 * ordered by guid, and the other by a key that is
63 * generated from tuple. Since cl_qmap supports only
64 * a 64-bit key, the maximal tuple length is 8 bytes,
65 * which means that maximal tree rank is 8.
66 * Note that the above also implies that each switch
67 * can have at max 255 up/down ports.
/* Lowest tree rank for which FatTree routing is used. */
70 #define FAT_TREE_MIN_RANK 2
/* Highest supported tree rank: a tuple has to fit the 64-bit cl_qmap key. */
71 #define FAT_TREE_MAX_RANK 8
74 FTREE_DIRECTION_DOWN = -1,
79 /***************************************************
83 ***************************************************/
/* Forward declarations of structures that are defined further down. */
88 struct ftree_port_group_t_;
89 struct ftree_fabric_t_;
91 /***************************************************
93 ** ftree_tuple_t definition
95 ***************************************************/
/* Size in bytes of each string buffer used by __osm_ftree_tuple_to_str(). */
97 #define FTREE_TUPLE_BUFF_LEN 1024
/* Number of bytes in a tuple. */
98 #define FTREE_TUPLE_LEN 8
/* Switch index: FTREE_TUPLE_LEN bytes, a byte of 0xFF meaning "unused". */
100 typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
/* A tuple copied byte-for-byte into a 64-bit map key. */
101 typedef uint64_t ftree_tuple_key_t;
103 struct guid_list_item {
108 /***************************************************
110 ** ftree_sw_table_element_t definition
112 ***************************************************/
115 cl_map_item_t map_item; /* linkage used when stored in sw_by_tuple_tbl */
116 struct ftree_sw_t_ *p_sw; /* switch this table entry refers to */
117 } ftree_sw_tbl_element_t;
119 /***************************************************
121 ** ftree_port_t definition
123 ***************************************************/
/* One port of a port group: local/remote port numbers and route counters. */
125 typedef struct ftree_port_t_ {
126 cl_map_item_t map_item;
127 uint8_t port_num; /* port number on the current node */
128 uint8_t remote_port_num; /* port number on the remote node */
129 uint32_t counter_up; /* number of allocated routes upwards */
130 uint32_t counter_down; /* number of allocated routes downwards */
133 /***************************************************
135 ** ftree_port_group_t definition
137 ***************************************************/
/* Pointer to a node object; the valid member is selected by the node type. */
139 typedef union ftree_hca_or_sw_ {
140 struct ftree_hca_t_ *p_hca; /* used when the node type is IB_NODE_TYPE_CA */
141 struct ftree_sw_t_ *p_sw; /* used when the node type is IB_NODE_TYPE_SWITCH */
/*
 * Group of ports on one node that all lead to the same remote base LID,
 * together with the identifiers of both ends of the link.
 */
144 typedef struct ftree_port_group_t_ {
145 cl_map_item_t map_item;
146 ib_net16_t base_lid; /* base lid of the current node */
147 ib_net16_t remote_base_lid; /* base lid of the remote node */
148 ib_net64_t port_guid; /* port guid of this port */
149 ib_net64_t node_guid; /* this node's guid */
150 uint8_t node_type; /* this node's type */
151 ib_net64_t remote_port_guid; /* port guid of the remote port */
152 ib_net64_t remote_node_guid; /* node guid of the remote node */
153 uint8_t remote_node_type; /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
154 ftree_hca_or_sw hca_or_sw; /* pointer to this hca/switch */
155 ftree_hca_or_sw remote_hca_or_sw; /* pointer to remote hca/switch */
156 cl_ptr_vector_t ports; /* vector of ports to the same lid */
157 boolean_t is_cn; /* whether this port is a compute node */
158 uint32_t counter_down; /* number of allocated routes downwards */
159 } ftree_port_group_t;
161 /***************************************************
163 ** ftree_sw_t definition
165 ***************************************************/
/* FatTree view of a switch: the OpenSM switch plus its up/down port groups. */
167 typedef struct ftree_sw_t_ {
168 cl_map_item_t map_item; /* linkage used when stored in sw_tbl */
169 osm_switch_t *p_osm_sw; /* underlying OpenSM switch object */
173 ftree_port_group_t **down_port_groups; /* array sized by the node's physical port count */
174 uint8_t down_port_groups_num; /* number of used entries in down_port_groups */
175 ftree_port_group_t **up_port_groups; /* array sized by the node's physical port count */
176 uint8_t up_port_groups_num; /* number of used entries in up_port_groups */
178 int down_port_groups_idx; /* set to -1 when the switch is created */
181 /***************************************************
183 ** ftree_hca_t definition
185 ***************************************************/
/* FatTree view of a channel adapter: the OpenSM node plus its up port groups. */
187 typedef struct ftree_hca_t_ {
188 cl_map_item_t map_item; /* linkage used when stored in hca_tbl */
189 osm_node_t *p_osm_node; /* underlying OpenSM node object */
190 ftree_port_group_t **up_port_groups; /* array sized by the node's physical port count */
191 uint16_t up_port_groups_num; /* number of used entries in up_port_groups */
195 /***************************************************
197 ** ftree_fabric_t definition
199 ***************************************************/
/* Top-level FatTree state: node tables, ranking results and leaf switches. */
201 typedef struct ftree_fabric_t_ {
205 cl_qmap_t sw_by_tuple_tbl; /* ftree_sw_tbl_element_t entries keyed by tuple key */
206 cl_qlist_t root_guid_list; /* list whose items are freed by fabric_clear */
207 cl_qmap_t cn_guid_tbl; /* name_map_item_t entries, freed by fabric_clear */
209 uint8_t leaf_switch_rank; /* rank of the leaf switches; tree rank is this + 1 */
210 uint8_t max_switch_rank; /* highest switch rank in the fabric */
211 ftree_sw_t **leaf_switches; /* array of leaf_switches_num leaf switches */
212 uint32_t leaf_switches_num;
213 uint16_t max_cn_per_leaf; /* bounds the DUMMY padding in the CA ordering dump */
214 uint16_t lft_max_lid_ho; /* largest LID seen so far, host order */
215 boolean_t fabric_built; /* reset to FALSE by fabric_clear */
218 /***************************************************
222 ***************************************************/
/*
 * Comparator taking two (ftree_sw_t **) arguments: walks both switch
 * tuples byte by byte and branches on the first position that differs.
 * NOTE(review): the return statements are not visible in this excerpt -
 * presumably positive/negative/zero in comparator fashion; confirm.
 */
224 static int OSM_CDECL __osm_ftree_compare_switches_by_index(IN const void *p1,
227 ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1;
228 ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2;
231 for (i = 0; i < FTREE_TUPLE_LEN; i++) {
232 if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
234 if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
240 /***************************************************/
243 __osm_ftree_compare_port_groups_by_remote_switch_index(IN const void *p1,
246 ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
247 ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
250 __osm_ftree_compare_switches_by_index(&
251 ((*pp_g1)->remote_hca_or_sw.
253 &((*pp_g2)->remote_hca_or_sw.
257 /***************************************************
259 ** ftree_tuple_t functions
261 ***************************************************/
263 static void __osm_ftree_tuple_init(IN ftree_tuple_t tuple)
265 memset(tuple, 0xFF, FTREE_TUPLE_LEN);
268 /***************************************************/
270 static inline boolean_t __osm_ftree_tuple_assigned(IN ftree_tuple_t tuple)
272 return (tuple[0] != 0xFF);
275 /***************************************************/
277 #define FTREE_TUPLE_BUFFERS_NUM 6
279 static char *__osm_ftree_tuple_to_str(IN ftree_tuple_t tuple)
281 static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
282 static uint8_t ind = 0;
286 if (!__osm_ftree_tuple_assigned(tuple))
287 return "INDEX.NOT.ASSIGNED";
289 buffer[ind][0] = '\0';
291 for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
292 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
293 return "INDEX.TOO.LONG";
295 strcat(buffer[ind], ".");
296 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
299 ret_buffer = buffer[ind];
300 ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
302 } /* __osm_ftree_tuple_to_str() */
304 /***************************************************/
306 static inline ftree_tuple_key_t __osm_ftree_tuple_to_key(IN ftree_tuple_t tuple)
308 ftree_tuple_key_t key;
309 memcpy(&key, tuple, FTREE_TUPLE_LEN);
313 /***************************************************/
315 static inline void __osm_ftree_tuple_from_key(IN ftree_tuple_t tuple,
316 IN ftree_tuple_key_t key)
318 memcpy(tuple, &key, FTREE_TUPLE_LEN);
321 /***************************************************
323 ** ftree_sw_tbl_element_t functions
325 ***************************************************/
327 static ftree_sw_tbl_element_t *__osm_ftree_sw_tbl_element_create(IN ftree_sw_t *
330 ftree_sw_tbl_element_t *p_element =
331 (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
334 memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
336 p_element->p_sw = p_sw;
340 /***************************************************/
/*
 * Counterpart of __osm_ftree_sw_tbl_element_create(); fabric_clear calls
 * it on every element of sw_by_tuple_tbl.
 * NOTE(review): the body is not visible in this excerpt.
 */
342 static void __osm_ftree_sw_tbl_element_destroy(IN ftree_sw_tbl_element_t *
350 /***************************************************
352 ** ftree_port_t functions
354 ***************************************************/
356 static ftree_port_t *__osm_ftree_port_create(IN uint8_t port_num,
357 IN uint8_t remote_port_num)
359 ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
362 memset(p_port, 0, sizeof(ftree_port_t));
364 p_port->port_num = port_num;
365 p_port->remote_port_num = remote_port_num;
370 /***************************************************/
/*
 * Counterpart of __osm_ftree_port_create(); port_group_destroy calls it
 * on every entry of a group's ports vector.
 * NOTE(review): the body is not visible in this excerpt.
 */
372 static void __osm_ftree_port_destroy(IN ftree_port_t * p_port)
378 /***************************************************
380 ** ftree_port_group_t functions
382 ***************************************************/
/*
 * Allocate and zero a port group, then fill in the LIDs, GUIDs and node
 * types of both link ends, the local and remote node pointers (stored in
 * the union member that matches each node type), the ports vector
 * (minimum size 0) and the is_cn flag.
 * NOTE(review): the allocation-failure path and the default cases of both
 * switches are not visible in this excerpt; confirm how they exit.
 */
384 static ftree_port_group_t *
385 __osm_ftree_port_group_create(IN ib_net16_t base_lid,
386 IN ib_net16_t remote_base_lid,
387 IN ib_net64_t port_guid,
388 IN ib_net64_t node_guid,
389 IN uint8_t node_type,
390 IN void *p_hca_or_sw,
391 IN ib_net64_t remote_port_guid,
392 IN ib_net64_t remote_node_guid,
393 IN uint8_t remote_node_type,
394 IN void *p_remote_hca_or_sw,
397 ftree_port_group_t *p_group =
398 (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
401 memset(p_group, 0, sizeof(ftree_port_group_t));
403 p_group->base_lid = base_lid;
404 p_group->remote_base_lid = remote_base_lid;
405 memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
406 memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
407 memcpy(&p_group->remote_port_guid, &remote_port_guid,
409 memcpy(&p_group->remote_node_guid, &remote_node_guid,
412 p_group->node_type = node_type;
/* local side: pick the union member that matches the node type */
414 case IB_NODE_TYPE_CA:
415 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
417 case IB_NODE_TYPE_SWITCH:
418 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
421 /* we shouldn't get here - port is created only in hca or switch */
425 p_group->remote_node_type = remote_node_type;
/* remote side: same selection, driven by remote_node_type */
426 switch (remote_node_type) {
427 case IB_NODE_TYPE_CA:
428 p_group->remote_hca_or_sw.p_hca =
429 (ftree_hca_t *) p_remote_hca_or_sw;
431 case IB_NODE_TYPE_SWITCH:
432 p_group->remote_hca_or_sw.p_sw =
433 (ftree_sw_t *) p_remote_hca_or_sw;
436 /* we shouldn't get here - port is created only in hca or switch */
440 cl_ptr_vector_init(&p_group->ports, 0, /* min size */
442 p_group->is_cn = is_cn;
444 } /* __osm_ftree_port_group_create() */
446 /***************************************************/
/*
 * Destroy every port stored in the group's ports vector and then the
 * vector itself.
 * NOTE(review): the NULL guard and the release of p_group are not visible
 * in this excerpt.
 */
448 static void __osm_ftree_port_group_destroy(IN ftree_port_group_t * p_group)
452 ftree_port_t *p_port;
457 /* remove all the elements of p_group->ports vector */
458 size = cl_ptr_vector_get_size(&p_group->ports);
459 for (i = 0; i < size; i++) {
460 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
461 __osm_ftree_port_destroy(p_port);
463 cl_ptr_vector_destroy(&p_group->ports);
465 } /* __osm_ftree_port_group_destroy() */
467 /***************************************************/
/*
 * Log one port group at OSM_LOG_DEBUG level: its size, the local port
 * numbers, the direction, and the local/remote port GUID and base LID
 * (converted to host order).  The port list is assembled in a 10 KB
 * stack buffer with unbounded sprintf() calls.
 */
470 __osm_ftree_port_group_dump(IN ftree_fabric_t * p_ftree,
471 IN ftree_port_group_t * p_group,
472 IN ftree_direction_t direction)
474 ftree_port_t *p_port;
477 char buff[10 * 1024];
/* only do the formatting work when debug logging is active */
482 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
485 size = cl_ptr_vector_get_size(&p_group->ports);
488 for (i = 0; i < size; i++) {
489 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
494 sprintf(buff + strlen(buff), "%u", p_port->port_num);
497 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
498 " Port Group of size %u, port(s): %s, direction: %s\n"
499 " Local <--> Remote GUID (LID):"
500 "0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
503 (direction == FTREE_DIRECTION_DOWN) ? "DOWN" : "UP",
504 cl_ntoh64(p_group->port_guid),
505 cl_ntoh16(p_group->base_lid),
506 cl_ntoh64(p_group->remote_port_guid),
507 cl_ntoh16(p_group->remote_base_lid));
509 } /* __osm_ftree_port_group_dump() */
511 /***************************************************/
514 __osm_ftree_port_group_add_port(IN ftree_port_group_t * p_group,
515 IN uint8_t port_num, IN uint8_t remote_port_num)
518 ftree_port_t *p_port;
520 for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) {
521 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
522 if (p_port->port_num == port_num)
526 p_port = __osm_ftree_port_create(port_num, remote_port_num);
527 cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
530 /***************************************************
532 ** ftree_sw_t functions
534 ***************************************************/
536 static ftree_sw_t *__osm_ftree_sw_create(IN ftree_fabric_t * p_ftree,
537 IN osm_switch_t * p_osm_sw)
542 /* make sure that the switch has ports */
543 if (p_osm_sw->num_ports == 1)
546 p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
549 memset(p_sw, 0, sizeof(ftree_sw_t));
551 p_sw->p_osm_sw = p_osm_sw;
552 p_sw->rank = 0xFFFFFFFF;
553 __osm_ftree_tuple_init(p_sw->tuple);
555 p_sw->base_lid = osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0);
557 ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
558 p_sw->down_port_groups =
559 (ftree_port_group_t **) malloc(ports_num *
560 sizeof(ftree_port_group_t *));
561 p_sw->up_port_groups =
562 (ftree_port_group_t **) malloc(ports_num *
563 sizeof(ftree_port_group_t *));
564 if (!p_sw->down_port_groups || !p_sw->up_port_groups)
566 p_sw->down_port_groups_num = 0;
567 p_sw->up_port_groups_num = 0;
569 /* initialize lft buffer */
570 memset(p_osm_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
572 p_sw->down_port_groups_idx = -1;
575 } /* __osm_ftree_sw_create() */
577 /***************************************************/
/*
 * Destroy all down and up port groups of a switch and free both port
 * group arrays.
 * NOTE(review): the NULL guard and the release of p_sw are not visible in
 * this excerpt.
 */
579 static void __osm_ftree_sw_destroy(IN ftree_fabric_t * p_ftree,
580 IN ftree_sw_t * p_sw)
587 for (i = 0; i < p_sw->down_port_groups_num; i++)
588 __osm_ftree_port_group_destroy(p_sw->down_port_groups[i]);
589 for (i = 0; i < p_sw->up_port_groups_num; i++)
590 __osm_ftree_port_group_destroy(p_sw->up_port_groups[i]);
591 if (p_sw->down_port_groups)
592 free(p_sw->down_port_groups);
593 if (p_sw->up_port_groups)
594 free(p_sw->up_port_groups);
597 } /* __osm_ftree_sw_destroy() */
599 /***************************************************/
/*
 * Return the node GUID of the switch's OpenSM node, exactly as
 * osm_node_get_node_guid() reports it (the _ho variant applies cl_ntoh64
 * to this value).
 * NOTE(review): lines between the signature and the return are not
 * visible in this excerpt.
 */
601 static uint64_t __osm_ftree_sw_get_guid_no(IN ftree_sw_t * p_sw)
605 return osm_node_get_node_guid(p_sw->p_osm_sw->p_node);
608 /***************************************************/
610 static uint64_t __osm_ftree_sw_get_guid_ho(IN ftree_sw_t * p_sw)
612 return cl_ntoh64(__osm_ftree_sw_get_guid_no(p_sw));
615 /***************************************************/
/*
 * Log a switch at OSM_LOG_DEBUG level: its index (tuple), GUID and the
 * number of down/up port groups, followed by a dump of every down port
 * group and then every up port group.
 */
617 static void __osm_ftree_sw_dump(IN ftree_fabric_t * p_ftree,
618 IN ftree_sw_t * p_sw)
/* only do the work when debug logging is active */
625 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
628 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
629 "Switch index: %s, GUID: 0x%016" PRIx64
630 ", Ports: %u DOWN, %u UP\n",
631 __osm_ftree_tuple_to_str(p_sw->tuple),
632 __osm_ftree_sw_get_guid_ho(p_sw), p_sw->down_port_groups_num,
633 p_sw->up_port_groups_num);
635 for (i = 0; i < p_sw->down_port_groups_num; i++)
636 __osm_ftree_port_group_dump(p_ftree,
637 p_sw->down_port_groups[i],
638 FTREE_DIRECTION_DOWN);
639 for (i = 0; i < p_sw->up_port_groups_num; i++)
640 __osm_ftree_port_group_dump(p_ftree, p_sw->up_port_groups[i],
643 } /* __osm_ftree_sw_dump() */
645 /***************************************************/
647 static boolean_t __osm_ftree_sw_ranked(IN ftree_sw_t * p_sw)
649 return (p_sw->rank != 0xFFFFFFFF);
652 /***************************************************/
/*
 * Linear search of the switch's up or down port groups (selected by
 * direction) for the group whose remote base LID equals remote_base_lid;
 * returns that group when found.
 * NOTE(review): the not-found return is not visible in this excerpt -
 * presumably NULL; confirm.
 */
654 static ftree_port_group_t *
655 __osm_ftree_sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
656 IN ib_net16_t remote_base_lid,
657 IN ftree_direction_t direction)
661 ftree_port_group_t **port_groups;
663 if (direction == FTREE_DIRECTION_UP) {
664 port_groups = p_sw->up_port_groups;
665 size = p_sw->up_port_groups_num;
667 port_groups = p_sw->down_port_groups;
668 size = p_sw->down_port_groups_num;
671 for (i = 0; i < size; i++)
672 if (remote_base_lid == port_groups[i]->remote_base_lid)
673 return port_groups[i];
676 } /* __osm_ftree_sw_get_port_group_by_remote_lid() */
678 /***************************************************/
/*
 * Register a port of the switch: look up the port group that leads to
 * remote_base_lid in the given direction, create a group and append it
 * to the up or down array where needed, then add the
 * (port_num, remote_port_num) pair to the group.
 * NOTE(review): the condition guarding group creation is not visible in
 * this excerpt - presumably "no existing group was found"; confirm.
 */
681 __osm_ftree_sw_add_port(IN ftree_sw_t * p_sw,
683 IN uint8_t remote_port_num,
684 IN ib_net16_t base_lid,
685 IN ib_net16_t remote_base_lid,
686 IN ib_net64_t port_guid,
687 IN ib_net64_t remote_port_guid,
688 IN ib_net64_t remote_node_guid,
689 IN uint8_t remote_node_type,
690 IN void *p_remote_hca_or_sw,
691 IN ftree_direction_t direction)
693 ftree_port_group_t *p_group =
694 __osm_ftree_sw_get_port_group_by_remote_lid(p_sw, remote_base_lid,
698 p_group = __osm_ftree_port_group_create(base_lid,
701 __osm_ftree_sw_get_guid_no
704 p_sw, remote_port_guid,
/* the new group goes at the end of the array for its direction */
711 if (direction == FTREE_DIRECTION_UP)
712 p_sw->up_port_groups[p_sw->up_port_groups_num++] =
715 p_sw->down_port_groups[p_sw->down_port_groups_num++] =
718 __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
720 } /* __osm_ftree_sw_add_port() */
722 /***************************************************/
/*
 * Thin wrapper: forwards lid_ho, port_num and hops to
 * osm_switch_set_hops() on the underlying OpenSM switch and returns its
 * status.
 */
724 static inline cl_status_t
725 __osm_ftree_sw_set_hops(IN ftree_sw_t * p_sw,
726 IN uint16_t lid_ho, IN uint8_t port_num,
729 /* set local min hop table(LID) */
730 return osm_switch_set_hops(p_sw->p_osm_sw, lid_ho, port_num, hops);
733 /***************************************************
735 ** ftree_hca_t functions
737 ***************************************************/
739 static ftree_hca_t *__osm_ftree_hca_create(IN osm_node_t * p_osm_node)
741 ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
744 memset(p_hca, 0, sizeof(ftree_hca_t));
746 p_hca->p_osm_node = p_osm_node;
747 p_hca->up_port_groups = (ftree_port_group_t **)
748 malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
749 sizeof(ftree_port_group_t *));
750 if (!p_hca->up_port_groups)
752 p_hca->up_port_groups_num = 0;
756 /***************************************************/
/*
 * Destroy all up port groups of a CA and free the port group array.
 * NOTE(review): the NULL guard and the release of p_hca are not visible
 * in this excerpt.
 */
758 static void __osm_ftree_hca_destroy(IN ftree_hca_t * p_hca)
765 for (i = 0; i < p_hca->up_port_groups_num; i++)
766 __osm_ftree_port_group_destroy(p_hca->up_port_groups[i]);
768 if (p_hca->up_port_groups)
769 free(p_hca->up_port_groups);
774 /***************************************************/
/*
 * Return the node GUID of the CA's OpenSM node, exactly as
 * osm_node_get_node_guid() reports it (the _ho variant applies cl_ntoh64
 * to this value).
 * NOTE(review): lines between the signature and the return are not
 * visible in this excerpt.
 */
776 static uint64_t __osm_ftree_hca_get_guid_no(IN ftree_hca_t * p_hca)
780 return osm_node_get_node_guid(p_hca->p_osm_node);
783 /***************************************************/
785 static uint64_t __osm_ftree_hca_get_guid_ho(IN ftree_hca_t * p_hca)
787 return cl_ntoh64(__osm_ftree_hca_get_guid_no(p_hca));
790 /***************************************************/
/*
 * Log a CA at OSM_LOG_DEBUG level: its GUID and number of up port
 * groups, followed by a dump of every up port group.
 */
792 static void __osm_ftree_hca_dump(IN ftree_fabric_t * p_ftree,
793 IN ftree_hca_t * p_hca)
/* only do the work when debug logging is active */
800 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
803 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
804 "CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
805 __osm_ftree_hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);
807 for (i = 0; i < p_hca->up_port_groups_num; i++)
808 __osm_ftree_port_group_dump(p_ftree, p_hca->up_port_groups[i],
812 /***************************************************/
/*
 * Linear search of the CA's up port groups for the group whose remote
 * base LID equals remote_base_lid; returns that group when found.
 * NOTE(review): the not-found return is not visible in this excerpt -
 * presumably NULL; confirm.
 */
814 static ftree_port_group_t *
815 __osm_ftree_hca_get_port_group_by_remote_lid(IN ftree_hca_t * p_hca,
816 IN ib_net16_t remote_base_lid)
819 for (i = 0; i < p_hca->up_port_groups_num; i++)
820 if (remote_base_lid ==
821 p_hca->up_port_groups[i]->remote_base_lid)
822 return p_hca->up_port_groups[i];
827 /***************************************************/
/*
 * Register a CA port that leads to a switch: look up the up port group
 * for remote_base_lid, create a group of type IB_NODE_TYPE_CA and append
 * it to up_port_groups where needed, then add the
 * (port_num, remote_port_num) pair to the group.
 * NOTE(review): the condition guarding group creation is not visible in
 * this excerpt - presumably "no existing group was found"; confirm.
 */
830 __osm_ftree_hca_add_port(IN ftree_hca_t * p_hca,
832 IN uint8_t remote_port_num,
833 IN ib_net16_t base_lid,
834 IN ib_net16_t remote_base_lid,
835 IN ib_net64_t port_guid,
836 IN ib_net64_t remote_port_guid,
837 IN ib_net64_t remote_node_guid,
838 IN uint8_t remote_node_type,
839 IN void *p_remote_hca_or_sw, IN boolean_t is_cn)
841 ftree_port_group_t *p_group;
843 /* this function is supposed to be called only for adding ports
844 in hca's that lead to switches */
845 CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
848 __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
852 p_group = __osm_ftree_port_group_create(base_lid,
855 __osm_ftree_hca_get_guid_no
857 IB_NODE_TYPE_CA, p_hca,
863 p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
865 __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
867 } /* __osm_ftree_hca_add_port() */
869 /***************************************************
871 ** ftree_fabric_t functions
873 ***************************************************/
/*
 * Allocate a zeroed fabric object and initialise its four maps (hca_tbl,
 * sw_tbl, sw_by_tuple_tbl, cn_guid_tbl) and the root GUID list.
 * NOTE(review): the allocation-failure check and the return statement
 * are not visible in this excerpt.
 */
875 static ftree_fabric_t *__osm_ftree_fabric_create()
877 ftree_fabric_t *p_ftree =
878 (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
882 memset(p_ftree, 0, sizeof(ftree_fabric_t));
884 cl_qmap_init(&p_ftree->hca_tbl);
885 cl_qmap_init(&p_ftree->sw_tbl);
886 cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
887 cl_qmap_init(&p_ftree->cn_guid_tbl);
889 cl_qlist_init(&p_ftree->root_guid_list);
894 /***************************************************/
/*
 * Release everything the fabric owns and reset it to an empty state:
 * destroys all CAs, switches and tuple-table elements, frees the CN GUID
 * entries, the root GUID list items and the leaf switch array, then
 * zeroes the rank/LID bookkeeping and clears fabric_built.
 * The fabric object itself is not freed here.
 *
 * Each map is walked with a "next" pointer fetched before the current
 * item is destroyed, and is emptied with cl_qmap_remove_all() afterwards.
 */
896 static void __osm_ftree_fabric_clear(ftree_fabric_t * p_ftree)
899 ftree_hca_t *p_next_hca;
901 ftree_sw_t *p_next_sw;
902 ftree_sw_tbl_element_t *p_element;
903 ftree_sw_tbl_element_t *p_next_element;
904 name_map_item_t *p_guid_element, *p_next_guid_element;
909 /* remove all the elements of hca_tbl */
911 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
912 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
914 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
915 __osm_ftree_hca_destroy(p_hca);
917 cl_qmap_remove_all(&p_ftree->hca_tbl);
919 /* remove all the elements of sw_tbl */
921 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
922 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
924 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
925 __osm_ftree_sw_destroy(p_ftree, p_sw);
927 cl_qmap_remove_all(&p_ftree->sw_tbl);
929 /* remove all the elements of sw_by_tuple_tbl */
932 (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
933 while (p_next_element !=
934 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->
936 p_element = p_next_element;
938 (ftree_sw_tbl_element_t *) cl_qmap_next(&p_element->
940 __osm_ftree_sw_tbl_element_destroy(p_element);
942 cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
944 /* remove all the elements of cn_guid_tbl */
945 p_next_guid_element =
946 (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
947 while (p_next_guid_element !=
948 (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
949 p_guid_element = p_next_guid_element;
950 p_next_guid_element =
951 (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
952 free(p_guid_element);
954 cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
956 /* remove all the elements of root_guid_list */
957 while (!cl_is_qlist_empty(&p_ftree->root_guid_list))
958 free(cl_qlist_remove_head(&p_ftree->root_guid_list));
960 /* free the leaf switches array */
961 if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
962 free(p_ftree->leaf_switches);
964 p_ftree->leaf_switches_num = 0;
966 p_ftree->leaf_switch_rank = 0;
967 p_ftree->max_switch_rank = 0;
968 p_ftree->max_cn_per_leaf = 0;
969 p_ftree->lft_max_lid_ho = 0;
970 p_ftree->leaf_switches = NULL;
971 p_ftree->fabric_built = FALSE;
973 } /* __osm_ftree_fabric_clear() */
975 /***************************************************/
/*
 * Tear down a fabric: clears its contents via __osm_ftree_fabric_clear().
 * NOTE(review): the NULL guard and the release of p_ftree itself are not
 * visible in this excerpt.
 */
977 static void __osm_ftree_fabric_destroy(ftree_fabric_t * p_ftree)
981 __osm_ftree_fabric_clear(p_ftree);
985 /***************************************************/
987 static uint8_t __osm_ftree_fabric_get_rank(ftree_fabric_t * p_ftree)
989 return p_ftree->leaf_switch_rank + 1;
992 /***************************************************/
994 static void __osm_ftree_fabric_add_hca(ftree_fabric_t * p_ftree,
995 osm_node_t * p_osm_node)
997 ftree_hca_t *p_hca = __osm_ftree_hca_create(p_osm_node);
999 CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
1001 cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid,
1005 /***************************************************/
1007 static void __osm_ftree_fabric_add_sw(ftree_fabric_t * p_ftree,
1008 osm_switch_t * p_osm_sw)
1010 ftree_sw_t *p_sw = __osm_ftree_sw_create(p_ftree, p_osm_sw);
1012 CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
1014 cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid,
1017 /* track the max lid (in host order) that exists in the fabric */
1018 if (cl_ntoh16(p_sw->base_lid) > p_ftree->lft_max_lid_ho)
1019 p_ftree->lft_max_lid_ho = cl_ntoh16(p_sw->base_lid);
1022 /***************************************************/
1024 static void __osm_ftree_fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1025 IN ftree_sw_t * p_sw)
1027 CL_ASSERT(__osm_ftree_tuple_assigned(p_sw->tuple));
1029 cl_qmap_insert(&p_ftree->sw_by_tuple_tbl,
1030 __osm_ftree_tuple_to_key(p_sw->tuple),
1031 &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
1034 /***************************************************/
1036 static ftree_sw_t *__osm_ftree_fabric_get_sw_by_tuple(IN ftree_fabric_t *
1038 IN ftree_tuple_t tuple)
1040 ftree_sw_tbl_element_t *p_element;
1042 CL_ASSERT(__osm_ftree_tuple_assigned(tuple));
1044 __osm_ftree_tuple_to_key(tuple);
1047 (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1048 __osm_ftree_tuple_to_key
1051 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1054 return p_element->p_sw;
1057 /***************************************************/
/*
 * Look up a switch in sw_tbl by GUID.  The result of cl_qmap_get() is
 * compared against the map's end marker to detect a miss.
 * NOTE(review): both return statements are not visible in this excerpt -
 * presumably NULL on a miss and the switch otherwise; confirm.
 */
1059 static ftree_sw_t *__osm_ftree_fabric_get_sw_by_guid(IN ftree_fabric_t *
1060 p_ftree, IN uint64_t guid)
1063 p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1064 if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1069 /***************************************************/
/*
 * Look up a CA in hca_tbl by GUID.  The result of cl_qmap_get() is
 * compared against the map's end marker to detect a miss.
 * NOTE(review): the rest of the parameter list and both return
 * statements are not visible in this excerpt - presumably NULL on a miss
 * and the CA otherwise; confirm.
 */
1071 static ftree_hca_t *__osm_ftree_fabric_get_hca_by_guid(IN ftree_fabric_t *
1076 p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1077 if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1082 /***************************************************/
1084 static void __osm_ftree_fabric_dump(ftree_fabric_t * p_ftree)
1090 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
1093 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1094 " |-------------------------------|\n"
1095 " |- Full fabric topology dump -|\n"
1096 " |-------------------------------|\n\n");
1098 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");
1100 for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1101 p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
1102 p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
1103 __osm_ftree_hca_dump(p_ftree, p_hca);
1106 for (i = 0; i < p_ftree->max_switch_rank; i++) {
1107 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1108 "-- Rank %u switches\n", i);
1109 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1110 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1111 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1112 if (p_sw->rank == i)
1113 __osm_ftree_sw_dump(p_ftree, p_sw);
1117 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1118 " |---------------------------------------|\n"
1119 " |- Full fabric topology dump completed -|\n"
1120 " |---------------------------------------|\n\n");
1121 } /* __osm_ftree_fabric_dump() */
1123 /***************************************************/
/*
 * Log a summary of the fabric at OSM_LOG_INFO level: the tree rank
 * (leaf_switch_rank + 1), the maximum switch rank, the CA/CN/switch
 * totals, and the number of switches at every rank from 0 through
 * max_switch_rank.  When OSM_LOG_VERBOSE is active it additionally lists
 * the rank-0 (root) switches and the leaf switches with their GUID, LID
 * and index.
 */
1125 static void __osm_ftree_fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
1130 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1131 "General fabric topology info\n");
1132 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1133 "============================\n");
1135 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1136 " - FatTree rank (roots to leaf switches): %u\n",
1137 p_ftree->leaf_switch_rank + 1);
1138 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1139 " - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
1140 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1141 " - Fabric has %u CAs (%u of them CNs), %u switches\n",
1142 cl_qmap_count(&p_ftree->hca_tbl), p_ftree->cn_num,
1143 cl_qmap_count(&p_ftree->sw_tbl));
1145 CL_ASSERT(cl_qmap_count(&p_ftree->hca_tbl) >= p_ftree->cn_num);
/* per-rank switch counts; the upper bound is inclusive */
1147 for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1149 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1150 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1151 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1152 if (p_sw->rank == i)
1156 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1157 " - Fabric has %u switches at rank %u (roots)\n",
1159 else if (i == p_ftree->leaf_switch_rank)
1160 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1161 " - Fabric has %u switches at rank %u (%u of them leafs)\n",
1162 j, i, p_ftree->leaf_switches_num);
1164 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1165 " - Fabric has %u switches at rank %u\n", j,
/* the detailed root and leaf listings are verbose-only */
1169 if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) {
1170 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1171 " - Root switches:\n");
1172 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1173 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1174 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1175 if (p_sw->rank == 0)
1176 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1177 " GUID: 0x%016" PRIx64
1178 ", LID: %u, Index %s\n",
1179 __osm_ftree_sw_get_guid_ho(p_sw),
1180 cl_ntoh16(p_sw->base_lid),
1181 __osm_ftree_tuple_to_str(p_sw->tuple));
1184 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1185 " - Leaf switches (sorted by index):\n");
1186 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1187 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1188 " GUID: 0x%016" PRIx64
1189 ", LID: %u, Index %s\n",
1190 __osm_ftree_sw_get_guid_ho(p_ftree->
1192 cl_ntoh16(p_ftree->leaf_switches[i]->base_lid),
1193 __osm_ftree_tuple_to_str(p_ftree->
1198 } /* __osm_ftree_fabric_dump_general_info() */
1200 /***************************************************/
/*
 * Write the CA ordering file "opensm-ftree-ca-order.dump" into the
 * configured dump_files_dir.  Leaf switches are visited in leaf_switches
 * array order; for each one, every compute-node CA reached through a
 * down port group is written as "<LID>\t<description>", and the switch's
 * section is then padded with "0xFFFF\tDUMMY" lines up to
 * max_cn_per_leaf entries.  A failure to open the file is logged as
 * ERR AB01.
 */
1202 static void __osm_ftree_fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
1206 ftree_port_group_t *p_group_on_sw;
1207 ftree_port_group_t *p_group_on_hca;
1210 unsigned printed_hcas_on_leaf;
1213 FILE *p_hca_ordering_file;
1214 char *filename = "opensm-ftree-ca-order.dump";
1216 snprintf(path, sizeof(path), "%s/%s",
1217 p_ftree->p_osm->subn.opt.dump_files_dir, filename);
1218 p_hca_ordering_file = fopen(path, "w");
1219 if (!p_hca_ordering_file) {
1220 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
1221 "cannot open file \'%s\': %s\n", filename,
1226 /* for each leaf switch (in indexing order) */
1227 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1228 p_sw = p_ftree->leaf_switches[i];
1229 printed_hcas_on_leaf = 0;
1231 /* for each real CA (CNs and not) connected to this switch */
1232 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1233 p_group_on_sw = p_sw->down_port_groups[j];
1235 if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
1238 p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
1240 __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
1244 /* treat non-compute nodes as dummies */
1245 if (!p_group_on_hca->is_cn)
1248 fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
1249 cl_ntoh16(p_group_on_hca->base_lid),
1250 p_hca->p_osm_node->print_desc);
1252 printed_hcas_on_leaf++;
1255 /* now print missing HCAs */
1257 j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++)
1258 fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");
1261 /* done going through all the leaf switches */
1263 fclose(p_hca_ordering_file);
1264 } /* __osm_ftree_fabric_dump_hca_ordering() */
1266 /***************************************************/
/*
 * Store new_tuple as the index of p_sw and register the switch under it.
 *
 * p_ftree   - fabric handed to __osm_ftree_fabric_add_sw_by_tuple()
 * p_sw      - switch being indexed; FTREE_TUPLE_LEN bytes are copied into
 *             p_sw->tuple
 * new_tuple - tuple to assign
 */
1269 __osm_ftree_fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
1270 IN ftree_sw_t * p_sw,
1271 IN ftree_tuple_t new_tuple)
/* the tuple must be in place before the switch is added by tuple */
1273 memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
1274 __osm_ftree_fabric_add_sw_by_tuple(p_ftree, p_sw);
1277 /***************************************************/
/*
 * Give p_sw the first tuple of the indexing pass.  A local tuple is reset
 * with __osm_ftree_tuple_init(), element 0 is set to the switch rank, the
 * elements 1..rank are visited, and the result is handed to
 * __osm_ftree_fabric_assign_tuple().  The caller describes the resulting
 * index as "[rank].0.0.0...".
 */
1279 static void __osm_ftree_fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
1280 IN ftree_sw_t * p_sw)
1283 ftree_tuple_t new_tuple;
1285 __osm_ftree_tuple_init(new_tuple);
/* element 0 of a tuple carries the rank of the switch */
1286 new_tuple[0] = (uint8_t) p_sw->rank;
1287 for (i = 1; i <= p_sw->rank; i++)
1290 __osm_ftree_fabric_assign_tuple(p_ftree, p_sw, new_tuple);
1293 /***************************************************/
/*
 * Find an unused tuple for a switch adjacent to the one indexed by
 * from_tuple.
 *
 * p_ftree    - fabric whose by-tuple lookup is used to test candidates
 * new_tuple  - OUT: reset with __osm_ftree_tuple_init() on entry and
 *              overwritten with the selected tuple at the end
 * from_tuple - tuple of the already-indexed neighbour
 * direction  - for FTREE_DIRECTION_DOWN the varied element is
 *              from_tuple[0] + 1; in the other branch it is from_tuple[0]
 *
 * The varied element is stepped through 0..0xFE until
 * __osm_ftree_fabric_get_sw_by_tuple() returns NULL for the candidate.
 */
1296 __osm_ftree_fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
1297 OUT ftree_tuple_t new_tuple,
1298 IN ftree_tuple_t from_tuple,
1299 IN ftree_direction_t direction)
1302 ftree_tuple_t temp_tuple;
1306 __osm_ftree_tuple_init(new_tuple);
/* work on a copy so from_tuple itself is never modified */
1307 memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
1309 if (direction == FTREE_DIRECTION_DOWN) {
1311 var_index = from_tuple[0] + 1;
1314 var_index = from_tuple[0];
/* try each value of the varied element until no switch owns the tuple */
1317 for (i = 0; i < 0xFF; i++) {
1318 temp_tuple[var_index] = i;
1319 p_sw = __osm_ftree_fabric_get_sw_by_tuple(p_ftree, temp_tuple);
1320 if (p_sw == NULL) /* found free tuple */
1325 /* new tuple not found - there are more than 255 ports in one direction */
1328 memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
1330 } /* __osm_ftree_fabric_get_new_tuple() */
1332 /***************************************************/
/*
 * Tell whether a root GUID file was configured: the result is true exactly
 * when subn.opt.root_guid_file is not NULL.
 */
1334 static inline boolean_t __osm_ftree_fabric_roots_provided(IN ftree_fabric_t *
1337 return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1340 /***************************************************/
/*
 * Tell whether a compute-node GUID file was configured: the result is true
 * exactly when subn.opt.cn_guid_file is not NULL.
 */
1342 static inline boolean_t __osm_ftree_fabric_cns_provided(IN ftree_fabric_t *
1345 return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1348 /***************************************************/
/*
 * Mark as leaf every switch that has a compute node (CN) port attached.
 *
 * All HCAs in hca_tbl are scanned; for each upward port group flagged is_cn
 * the switch on the remote side gets is_leaf = TRUE.  The rank of that
 * switch is then compared with p_ftree->leaf_switch_rank and a mismatch is
 * reported as ERR AB26.
 *
 * Returns an int status - presumably 0 on success and non-zero when a CN is
 * attached at the wrong rank; confirm against the callers.
 */
1350 static int __osm_ftree_fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
1354 ftree_hca_t *p_next_hca;
1358 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1360 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1361 "Marking leaf switches in fabric\n");
1363 /* Scan all the CAs, if they have CNs - find CN port and mark switch
1364 that is connected to this port as leaf switch.
1365 Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */
1366 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1367 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
1369 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
1373 for (i = 0; i < p_hca->up_port_groups_num; i++) {
1374 if (!p_hca->up_port_groups[i]->is_cn)
1377 /* In CAs, port group always has one port, and since this
1378 port group is CN, we know that this port is compute node */
1379 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
1380 IB_NODE_TYPE_SWITCH);
1381 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
1383 /* check if this switch was already processed */
1386 p_sw->is_leaf = TRUE;
1388 /* ensure that this leaf switch is at the correct tree level */
1389 if (p_sw->rank != p_ftree->leaf_switch_rank) {
1390 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1391 "ERR AB26: CN port 0x%" PRIx64
1392 " is connected to switch 0x%" PRIx64
1394 "while FatTree leaf rank is %u\n",
1395 cl_ntoh64(p_hca->up_port_groups[i]->
1397 __osm_ftree_sw_get_guid_ho(p_sw),
1398 p_sw->rank, p_ftree->leaf_switch_rank);
1407 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1409 } /* __osm_ftree_fabric_mark_leaf_switches() */
1411 /***************************************************/
/*
 * Assign an index (tuple) to the switches of the fabric.
 *
 * A starting switch is picked while scanning sw_tbl (the comment below says
 * the first leaf switch is used), given the first tuple, and put on bfs_list
 * as the origin of a breadth-first traversal.  Each switch taken from the
 * list hands fresh tuples to the not-yet-indexed switches behind its down and
 * up port groups and queues them, then qsort()s its own port-group arrays
 * with __osm_ftree_compare_port_groups_by_remote_switch_index.
 *
 * Queue entries are created with __osm_ftree_sw_tbl_element_create() and
 * destroyed as soon as the switch pointer has been read back.
 */
1413 static void __osm_ftree_fabric_make_indexing(IN ftree_fabric_t * p_ftree)
1415 ftree_sw_t *p_remote_sw;
1416 ftree_sw_t *p_sw = NULL;
1417 ftree_sw_t *p_next_sw;
1418 ftree_tuple_t new_tuple;
1421 ftree_sw_tbl_element_t *p_sw_tbl_element;
1423 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1425 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1426 "Starting FatTree indexing\n");
1428 /* using the first leaf switch as a starting point for indexing algorithm. */
1429 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1430 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1434 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1437 CL_ASSERT(p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl));
1439 /* Assign the first tuple to the switch that is used as BFS starting point.
1440 The tuple will be as follows: [rank].0.0.0...
1441 This function also adds the switch into the switch_by_tuple table. */
1442 __osm_ftree_fabric_assign_first_tuple(p_ftree, p_sw);
1444 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1445 "Indexing starting point:\n"
1446 " - Switch rank : %u\n"
1447 " - Switch index : %s\n"
1448 " - Node LID : %u\n"
1449 " - Node GUID : 0x%016"
1450 PRIx64 "\n", p_sw->rank, __osm_ftree_tuple_to_str(p_sw->tuple),
1451 cl_ntoh16(p_sw->base_lid), __osm_ftree_sw_get_guid_ho(p_sw));
1454 * Now run BFS and assign indexes to all switches
1455 * Pseudo code of the algorithm is as follows:
1457 * * Add first switch to BFS queue
1458 * * While (BFS queue not empty)
1459 * - Pop the switch from the head of the queue
1460 * - Scan all the downward and upward ports
1462 * + Get the remote switch
1463 * + Assign index to the remote switch
1464 * + Add remote switch to the BFS queue
1467 cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
1468 cl_list_insert_tail(&bfs_list,
1469 &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
1471 while (!cl_is_list_empty(&bfs_list)) {
1473 (ftree_sw_tbl_element_t *) cl_list_remove_head(&bfs_list);
1474 p_sw = p_sw_tbl_element->p_sw;
1475 __osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element);
1477 /* Discover all the nodes from ports that are pointing down */
1479 if (p_sw->rank >= p_ftree->leaf_switch_rank) {
1480 /* whether downward ports are pointing to CAs or switches,
1481 we don't assign indexes to switches that are located
1482 lower than leaf switches */
1484 /* This is not the leaf switch */
1485 for (i = 0; i < p_sw->down_port_groups_num; i++) {
1486 /* Work with port groups that are pointing to switches only.
1487 No need to assign indexing to HCAs */
1488 if (p_sw->down_port_groups[i]->
1489 remote_node_type != IB_NODE_TYPE_SWITCH)
1493 p_sw->down_port_groups[i]->remote_hca_or_sw.
1495 if (__osm_ftree_tuple_assigned
1496 (p_remote_sw->tuple)) {
1497 /* this switch has been already indexed */
1500 /* allocate new tuple */
1501 __osm_ftree_fabric_get_new_tuple(p_ftree,
1504 FTREE_DIRECTION_DOWN);
1505 /* Assign the new tuple to the remote switch.
1506 This function also adds the switch into the switch_by_tuple table. */
1507 __osm_ftree_fabric_assign_tuple(p_ftree,
1511 /* add the newly discovered switch to the BFS queue */
1512 cl_list_insert_tail(&bfs_list,
1513 &__osm_ftree_sw_tbl_element_create
1514 (p_remote_sw)->map_item);
1516 /* Done assigning indexes to all the remote switches
1517 that are pointed by the downgoing ports.
1518 Now sort port groups according to remote index. */
1519 qsort(p_sw->down_port_groups, /* array */
1520 p_sw->down_port_groups_num, /* number of elements */
1521 sizeof(ftree_port_group_t *), /* size of each element */
1522 __osm_ftree_compare_port_groups_by_remote_switch_index); /* comparator */
1525 /* Done indexing switches from ports that go down.
1526 Now do the same with ports that are pointing up. */
1528 if (p_sw->rank != 0) {
1529 /* This is not the root switch, which means that all the ports
1530 that are pointing up are taking us to another switches. */
1531 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1533 p_sw->up_port_groups[i]->remote_hca_or_sw.
1535 if (__osm_ftree_tuple_assigned
1536 (p_remote_sw->tuple))
1538 /* allocate new tuple */
1539 __osm_ftree_fabric_get_new_tuple(p_ftree,
1542 FTREE_DIRECTION_UP);
1543 /* Assign the new tuple to the remote switch.
1544 This function also adds the switch to the
1545 switch_by_tuple table. */
1546 __osm_ftree_fabric_assign_tuple(p_ftree,
1549 /* add the newly discovered switch to the BFS queue */
1550 cl_list_insert_tail(&bfs_list,
1551 &__osm_ftree_sw_tbl_element_create
1552 (p_remote_sw)->map_item);
1554 /* Done assigning indexes to all the remote switches
1555 that are pointed by the upgoing ports.
1556 Now sort port groups according to remote index. */
1557 qsort(p_sw->up_port_groups, /* array */
1558 p_sw->up_port_groups_num, /* number of elements */
1559 sizeof(ftree_port_group_t *), /* size of each element */
1560 __osm_ftree_compare_port_groups_by_remote_switch_index); /* comparator */
1562 /* Done assigning indexes to all the switches that are directly connected
1563 to the current switch - go to the next switch in the BFS queue */
1565 cl_list_destroy(&bfs_list);
1567 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1568 } /* __osm_ftree_fabric_make_indexing() */
1570 /***************************************************/
/*
 * Build p_ftree->leaf_switches, the index-ordered array of leaf switches.
 *
 * Every switch in sw_tbl whose rank equals leaf_switch_rank is collected
 * into a temporary array, which is sorted with
 * __osm_ftree_compare_switches_by_index.  The slice running from the first
 * to the last entry that has is_leaf set is then copied into a newly
 * malloc()ed p_ftree->leaf_switches, and leaf_switches_num is set to the
 * length of that slice.  Switches at leaf rank without CNs therefore remain
 * in the array when they are indexed between two real leaves.
 *
 * Returns an int status; both allocation failures are logged at
 * OSM_LOG_SYS level.
 */
1572 static int __osm_ftree_fabric_create_leaf_switch_array(IN ftree_fabric_t *
1576 ftree_sw_t *p_next_sw;
1577 ftree_sw_t **all_switches_at_leaf_level;
1579 unsigned all_leaf_idx = 0;
1580 unsigned first_leaf_idx;
1581 unsigned last_leaf_idx;
1584 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1586 /* create array of ALL the switches that have leaf rank */
1587 all_switches_at_leaf_level = (ftree_sw_t **)
1588 malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1589 if (!all_switches_at_leaf_level) {
1590 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1591 "Fat-tree routing: Memory allocation failed\n");
1595 memset(all_switches_at_leaf_level, 0,
1596 cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1598 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1599 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1601 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1602 if (p_sw->rank == p_ftree->leaf_switch_rank) {
1603 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1604 "Adding switch 0x%" PRIx64
1605 " to full leaf switch array\n",
1606 __osm_ftree_sw_get_guid_ho(p_sw));
1607 all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
1612 /* quick-sort array of leaf switches by index */
1613 qsort(all_switches_at_leaf_level, /* array */
1614 all_leaf_idx, /* number of elements */
1615 sizeof(ftree_sw_t *), /* size of each element */
1616 __osm_ftree_compare_switches_by_index); /* comparator */
1618 /* check the first and the last REAL leaf (the one
1619 that has CNs) in the array of all the leafs */
/* first_leaf_idx starts one past the last valid slot, i.e. "none found yet" */
1621 first_leaf_idx = all_leaf_idx;
1623 for (i = 0; i < all_leaf_idx; i++) {
1624 if (all_switches_at_leaf_level[i]->is_leaf) {
1625 if (i < first_leaf_idx)
/* NOTE(review): the strict '<' below looks like it would trip when exactly one
   switch at leaf rank has is_leaf set (first == last) - confirm whether a
   single-leaf fabric can reach this point */
1630 CL_ASSERT(first_leaf_idx < last_leaf_idx);
1632 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1633 "Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
1634 first_leaf_idx, last_leaf_idx);
1636 /* Create array of REAL leaf switches, sorted by index.
1637 This array may contain switches at the same rank w/o CNs,
1638 in case this is the order of indexing. */
1639 p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
1640 p_ftree->leaf_switches = (ftree_sw_t **)
1641 malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1642 if (!p_ftree->leaf_switches) {
1643 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1644 "Fat-tree routing: Memory allocation failed\n");
1649 memcpy(p_ftree->leaf_switches,
1650 &(all_switches_at_leaf_level[first_leaf_idx]),
1651 p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1653 free(all_switches_at_leaf_level);
1655 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1656 "Created array of %u leaf switches\n",
1657 p_ftree->leaf_switches_num);
1660 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1662 } /* __osm_ftree_fabric_create_leaf_switch_array() */
1664 /***************************************************/
/*
 * Record in p_ftree->max_cn_per_leaf the largest per-leaf compute-node count.
 *
 * For every switch in leaf_switches[] a counter is reset to zero, the down
 * port groups are tested for a CA on the remote side, and the cn_num of the
 * attached HCA feeds the counter.  max_cn_per_leaf is only ever raised here,
 * never lowered.
 */
1666 static void __osm_ftree_fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
1670 unsigned cns_on_this_leaf;
1672 ftree_port_group_t *p_group;
1674 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1675 p_sw = p_ftree->leaf_switches[i];
1676 cns_on_this_leaf = 0;
1677 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1678 p_group = p_sw->down_port_groups[j];
1679 if (p_group->remote_node_type != IB_NODE_TYPE_CA)
1682 p_group->remote_hca_or_sw.p_hca->cn_num;
/* keep the running maximum over all leaves */
1684 if (cns_on_this_leaf > p_ftree->max_cn_per_leaf)
1685 p_ftree->max_cn_per_leaf = cns_on_this_leaf;
1687 } /* __osm_ftree_fabric_set_max_cn_per_leaf() */
1689 /***************************************************/
/*
 * Check that the discovered fabric is regular enough to be routed as a
 * FatTree.
 *
 * The first switch seen at each rank becomes the reference for that rank and
 * every later switch of the same rank is compared against it:
 *   - number of upward port groups                             (ERR AB09)
 *   - number of downward port groups, skipped for switches at
 *     rank tree_rank - 1 because some HCAs may be missing      (ERR AB0A)
 *   - number of ports in every upward port group               (ERR AB0B)
 *   - number of ports in every downward port group, again
 *     skipped for switches at rank tree_rank - 1               (ERR AB0C)
 * Port counts are compared with port group 0 of the reference switch.
 *
 * The result is tracked in res, which starts as TRUE; the scan over sw_tbl
 * stops as soon as res is cleared.  The outcome is logged either as a
 * VERBOSE message or as ERR AB0D.  An allocation failure of the per-rank
 * reference array is logged at OSM_LOG_SYS level.
 */
1691 static boolean_t __osm_ftree_fabric_validate_topology(IN ftree_fabric_t *
1694 ftree_port_group_t *p_group;
1695 ftree_port_group_t *p_ref_group;
1697 ftree_sw_t *p_next_sw;
1698 ftree_sw_t **reference_sw_arr;
1699 uint16_t tree_rank = __osm_ftree_fabric_get_rank(p_ftree);
1700 boolean_t res = TRUE;
1703 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1705 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1706 "Validating fabric topology\n");
/* one reference-switch slot per rank, all initially empty */
1709 (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1710 if (reference_sw_arr == NULL) {
1711 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1712 "Fat-tree routing: Memory allocation failed\n");
1715 memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1717 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1718 while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1720 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1722 if (!reference_sw_arr[p_sw->rank]) {
1723 /* This is the first switch in the current level that
1724 we're checking - use it as a reference */
1725 reference_sw_arr[p_sw->rank] = p_sw;
1727 /* compare this switch properties to the reference switch */
1729 if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1730 p_sw->up_port_groups_num) {
1731 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1732 "ERR AB09: Different number of upward port groups on switches:\n"
1733 " GUID 0x%016" PRIx64
1734 ", LID %u, Index %s - %u groups\n"
1735 " GUID 0x%016" PRIx64
1736 ", LID %u, Index %s - %u groups\n",
1737 __osm_ftree_sw_get_guid_ho
1738 (reference_sw_arr[p_sw->rank]),
1739 cl_ntoh16(reference_sw_arr[p_sw->rank]->
1741 __osm_ftree_tuple_to_str
1742 (reference_sw_arr[p_sw->rank]->tuple),
1743 reference_sw_arr[p_sw->rank]->
1745 __osm_ftree_sw_get_guid_ho(p_sw),
1746 cl_ntoh16(p_sw->base_lid),
1747 __osm_ftree_tuple_to_str(p_sw->tuple),
1748 p_sw->up_port_groups_num);
1753 if (p_sw->rank != (tree_rank - 1) &&
1754 reference_sw_arr[p_sw->rank]->
1755 down_port_groups_num !=
1756 p_sw->down_port_groups_num) {
1757 /* we're allowing some hca's to be missing */
1758 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1759 "ERR AB0A: Different number of downward port groups on switches:\n"
1760 " GUID 0x%016" PRIx64
1761 ", LID %u, Index %s - %u port groups\n"
1762 " GUID 0x%016" PRIx64
1763 ", LID %u, Index %s - %u port groups\n",
1764 __osm_ftree_sw_get_guid_ho
1765 (reference_sw_arr[p_sw->rank]),
1766 cl_ntoh16(reference_sw_arr[p_sw->rank]->
1768 __osm_ftree_tuple_to_str
1769 (reference_sw_arr[p_sw->rank]->tuple),
1770 reference_sw_arr[p_sw->rank]->
1771 down_port_groups_num,
1772 __osm_ftree_sw_get_guid_ho(p_sw),
1773 cl_ntoh16(p_sw->base_lid),
1774 __osm_ftree_tuple_to_str(p_sw->tuple),
1775 p_sw->down_port_groups_num);
1780 if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1783 reference_sw_arr[p_sw->rank]->
1785 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1786 p_group = p_sw->up_port_groups[i];
1787 if (cl_ptr_vector_get_size
1788 (&p_ref_group->ports) !=
1789 cl_ptr_vector_get_size(&p_group->
1791 OSM_LOG(&p_ftree->p_osm->log,
1793 "ERR AB0B: Different number of ports in an upward port group on switches:\n"
1796 ", LID %u, Index %s - %u ports\n"
1799 ", LID %u, Index %s - %u ports\n",
1800 __osm_ftree_sw_get_guid_ho
1807 __osm_ftree_tuple_to_str
1809 [p_sw->rank]->tuple),
1810 cl_ptr_vector_get_size
1811 (&p_ref_group->ports),
1812 __osm_ftree_sw_get_guid_ho
1816 __osm_ftree_tuple_to_str
1818 cl_ptr_vector_get_size
1825 if (reference_sw_arr[p_sw->rank]->
1826 down_port_groups_num != 0
1827 && p_sw->rank != (tree_rank - 1)) {
1828 /* we're allowing some hca's to be missing */
1830 reference_sw_arr[p_sw->rank]->
1831 down_port_groups[0];
1832 for (i = 0; i < p_sw->down_port_groups_num; i++) {
/* check the i-th downward group, as the upward loop does; indexing
   with 0 here would re-test group 0 on every iteration and leave
   all the other downward groups unchecked */
1833 p_group = p_sw->down_port_groups[i];
1834 if (cl_ptr_vector_get_size
1835 (&p_ref_group->ports) !=
1836 cl_ptr_vector_get_size(&p_group->
1838 OSM_LOG(&p_ftree->p_osm->log,
1840 "ERR AB0C: Different number of ports in an downward port group on switches:\n"
1843 ", LID %u, Index %s - %u ports\n"
1846 ", LID %u, Index %s - %u ports\n",
1847 __osm_ftree_sw_get_guid_ho
1854 __osm_ftree_tuple_to_str
1856 [p_sw->rank]->tuple),
1857 cl_ptr_vector_get_size
1858 (&p_ref_group->ports),
1859 __osm_ftree_sw_get_guid_ho
1863 __osm_ftree_tuple_to_str
1865 cl_ptr_vector_get_size
1873 } /* end of while */
1876 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1877 "Fabric topology has been identified as FatTree\n");
1879 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1880 "ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
1882 free(reference_sw_arr);
1883 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1885 } /* __osm_ftree_fabric_validate_topology() */
1887 /***************************************************
1888 ***************************************************/
/*
 * Per-switch callback: p_map_item is treated as an ftree_sw_t and context as
 * the ftree_fabric_t.  Copies the fabric's lft_max_lid_ho into max_lid_ho of
 * the OpenSM switch object and then calls osm_ucast_mgr_set_fwd_table() with
 * the subnet manager's ucast_mgr.
 */
1890 static void __osm_ftree_set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
1893 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
1894 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
/* max_lid_ho is updated before the forwarding table is handed over */
1896 p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid_ho;
1897 osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr,
1901 /***************************************************
1902 ***************************************************/
1905 * Function: assign-up-going-port-by-descending-down
1906 * Given : a switch and a LID
1908 * foreach down-going-port-group (in indexing order)
1909 * skip this group if the LFT(LID) port is part of this group
1910 * find the least loaded port of the group (scan in indexing order)
1911 * r-port is the remote port connected to it
1912 * assign the remote switch node LFT(LID) to r-port
1913 * increase r-port usage counter
1914 * assign-up-going-port-by-descending-down to r-port node (recursion)
/*
 * Descend from p_sw and tell every switch below it which of its ports leads
 * (upwards) towards target_lid.
 *
 * p_ftree               - fabric being routed
 * p_sw                  - switch whose down port groups are scanned
 * p_prev_sw             - switch we climbed up from; its port group is
 *                         recognised by remote_base_lid; NULL is passed when
 *                         this switch was reached by going down
 * target_lid            - LID being routed, network byte order
 * target_rank           - rank of the node owning target_lid
 * is_real_lid           - forwarding-table entries are written only for
 *                         real LIDs
 * is_main_path          - counter_up is promoted only on the main path
 * highest_rank_in_route - highest point visited before going down; used for
 *                         the hop count and for the reported loop length
 *
 * The scan starts at down_port_groups_idx, which is advanced by one (modulo
 * the number of groups) on every call, so successive calls begin with a
 * different group.  For each group pointing at a switch the port with the
 * smallest counter_up is selected, and the function recurses into the remote
 * switch with p_prev_sw set to NULL.
 */
1918 __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
1919 IN ftree_sw_t * p_sw,
1920 IN ftree_sw_t * p_prev_sw,
1921 IN ib_net16_t target_lid,
1922 IN uint8_t target_rank,
1923 IN boolean_t is_real_lid,
1924 IN boolean_t is_main_path,
1925 IN uint8_t highest_rank_in_route)
1927 ftree_sw_t *p_remote_sw;
1929 ftree_port_group_t *p_group;
1930 ftree_port_t *p_port;
1931 ftree_port_t *p_min_port;
1936 /* we shouldn't enter here if both real_lid and main_path are false */
1937 CL_ASSERT(is_real_lid || is_main_path);
1939 /* if there is no down-going ports */
1940 if (p_sw->down_port_groups_num == 0)
1943 /* promote the index that indicates which group should we
1944 start with when going through all the downgoing groups */
1945 p_sw->down_port_groups_idx =
1946 (p_sw->down_port_groups_idx + 1) % p_sw->down_port_groups_num;
1948 /* foreach down-going port group (in indexing order) */
1949 i = p_sw->down_port_groups_idx;
1950 for (k = 0; k < p_sw->down_port_groups_num; k++) {
1952 p_group = p_sw->down_port_groups[i];
1953 i = (i + 1) % p_sw->down_port_groups_num;
1955 /* Skip this port group unless it points to a switch */
1956 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
1960 && (p_group->remote_base_lid == p_prev_sw->base_lid)) {
1961 /* This port group has a port that was used when we entered this switch,
1962 which means that the current group points to the switch where we were
1963 at the previous step of the algorithm (before going up).
1964 Skipping this group. */
1968 /* find the least loaded port of the group (in indexing order) */
1970 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
1971 /* ToDo: no need to select a least loaded port for non-main path.
1972 Think about optimization. */
1973 for (j = 0; j < ports_num; j++) {
1974 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
1976 /* first port that we're checking - set as port with the lowest load */
1977 p_min_port = p_port;
1978 } else if (p_port->counter_up < p_min_port->counter_up) {
1979 /* this port is less loaded - use it as min */
1980 p_min_port = p_port;
1983 /* At this point we have selected a port in this group with the
1984 lowest load of upgoing routes.
1985 Set on the remote switch how to get to the target_lid -
1986 set LFT(target_lid) on the remote switch to the remote port */
1987 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
1989 if (osm_switch_get_least_hops(p_remote_sw->p_osm_sw,
1990 cl_ntoh16(target_lid)) !=
1992 /* Loop in the fabric - we already routed the remote switch
1993 on our way UP, and now we see it again on our way DOWN */
/* NOTE(review): the message pairs p_remote_sw's tuple with p_group->base_lid
   and p_sw's tuple with p_group->remote_base_lid; the two LIDs look swapped
   relative to the tuples - confirm */
1994 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1995 "Loop of lenght %d in the fabric:\n "
1996 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
1997 (p_remote_sw->rank - highest_rank_in_route) * 2,
1998 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
1999 cl_ntoh16(p_group->base_lid),
2000 __osm_ftree_tuple_to_str(p_sw->tuple),
2001 cl_ntoh16(p_group->remote_base_lid));
2005 /* Four possible cases:
2007 * 1. is_real_lid == TRUE && is_main_path == TRUE:
2008 * - going DOWN(TRUE,TRUE) through ALL the groups
2009 * + promoting port counter
2010 * + setting path in remote switch fwd tbl
2011 * + setting hops in remote switch on all the ports of each group
2013 * 2. is_real_lid == TRUE && is_main_path == FALSE:
2014 * - going DOWN(TRUE,FALSE) through ALL the groups but only if
2015 * the remote (lower) switch hasn't been already configured
2016 * for this target LID
2017 * + NOT promoting port counter
2018 * + setting path in remote switch fwd tbl if it hasn't been set yet
2019 * + setting hops in remote switch on all the ports of each group
2020 * if it hasn't been set yet
2022 * 3. is_real_lid == FALSE && is_main_path == TRUE:
2023 * - going DOWN(FALSE,TRUE) through ALL the groups
2024 * + promoting port counter
2025 * + NOT setting path in remote switch fwd tbl
2026 * + NOT setting hops in remote switch
2028 * 4. is_real_lid == FALSE && is_main_path == FALSE:
2029 * - illegal state - we shouldn't get here
2032 /* second case: skip the port group if the remote (lower)
2033 switch has been already configured for this target LID */
2034 if (is_real_lid && !is_main_path &&
2035 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
2038 /* setting fwd tbl port only if this is real LID */
2040 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2041 p_min_port->remote_port_num;
2042 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2043 "Switch %s: set path to CA LID %u through port %u\n",
2044 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
2045 cl_ntoh16(target_lid),
2046 p_min_port->remote_port_num);
2048 /* On the remote switch that is pointed by the p_group,
2049 set hops for ALL the ports in the remote group. */
2051 for (j = 0; j < ports_num; j++) {
2052 cl_ptr_vector_at(&p_group->ports, j,
2055 __osm_ftree_sw_set_hops(p_remote_sw,
2056 cl_ntoh16(target_lid),
2057 p_port->remote_port_num,
2059 highest_rank_in_route)
2060 + (p_remote_sw->rank -
2061 highest_rank_in_route)));
2066 /* The number of upgoing routes is tracked in the
2067 p_port->counter_up counter of the port that belongs to
2068 the upper side of the link (on switch with lower rank).
2069 Counter is promoted only if we're routing LID on the main
2070 path (whether it's a real LID or a dummy one). */
2072 p_min_port->counter_up++;
2075 Assign upgoing ports by stepping down, starting on REMOTE switch */
2076 __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */
2077 NULL, /* prev. position - NULL to mark that we went down and not up */
2078 target_lid, /* LID that we're routing to */
2079 target_rank, /* rank of the LID that we're routing to */
2080 is_real_lid, /* whether the target LID is real or dummy */
2081 is_main_path, /* whether this is path to HCA that should be tracked by counters */
2082 highest_rank_in_route); /* highest visited point in the tree before going down */
2084 /* done scanning all the down-going port groups */
2086 } /* __osm_ftree_fabric_route_upgoing_by_going_down() */
2088 /***************************************************/
2091 * Function: assign-down-going-port-by-ascending-up
2092 * Given : a switch and a LID
2094 * find the least loaded port of all the upgoing groups (scan in indexing order)
2095 * assign the LFT(LID) of remote switch to that port
2096 * track that port usage
2097 * assign-up-going-port-by-descending-down on CURRENT switch
2098 * assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
2102 __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2103 IN ftree_sw_t * p_sw,
2104 IN ftree_sw_t * p_prev_sw,
2105 IN ib_net16_t target_lid,
2106 IN uint8_t target_rank,
2107 IN boolean_t is_real_lid,
2108 IN boolean_t is_main_path)
2110 ftree_sw_t *p_remote_sw;
2112 ftree_port_group_t *p_group;
2113 ftree_port_t *p_port;
2114 ftree_port_group_t *p_min_group;
2115 ftree_port_t *p_min_port;
2119 /* we shouldn't enter here if both real_lid and main_path are false */
2120 CL_ASSERT(is_real_lid || is_main_path);
2122 /* Assign upgoing ports by stepping down, starting on THIS switch */
2123 __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */
2124 p_prev_sw, /* switch that we went up from (NULL means that we went down) */
2125 target_lid, /* LID that we're routing to */
2126 target_rank, /* rank of the LID that we're routing to */
2127 is_real_lid, /* whether this target LID is real or dummy */
2128 is_main_path, /* whether this path to HCA should by tracked by counters */
2129 p_sw->rank); /* the highest visited point in the tree before going down */
2131 /* recursion stop condition - if it's a root switch, */
2132 if (p_sw->rank == 0)
2135 /* Find the least loaded upgoing port group */
2137 for (i = 0; i < p_sw->up_port_groups_num; i++) {
2138 p_group = p_sw->up_port_groups[i];
2140 /* first group that we're checking - use
2141 it as a group with the lowest load */
2142 p_min_group = p_group;
2143 } else if (p_group->counter_down < p_min_group->counter_down) {
2144 /* this group is less loaded - use it as min */
2145 p_min_group = p_group;
2149 /* Find the least loaded upgoing port in the selected group */
2151 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2152 for (j = 0; j < ports_num; j++) {
2153 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2155 /* first port that we're checking - use
2156 it as a port with the lowest load */
2157 p_min_port = p_port;
2158 } else if (p_port->counter_down < p_min_port->counter_down) {
2159 /* this port is less loaded - use it as min */
2160 p_min_port = p_port;
2164 /* At this point we have selected a group and port with the
2165 lowest load of downgoing routes.
2166 Set on the remote switch how to get to the target_lid -
2167 set LFT(target_lid) on the remote switch to the remote port */
2168 p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2170 /* Four possible cases:
2172 * 1. is_real_lid == TRUE && is_main_path == TRUE:
2173 * - going UP(TRUE,TRUE) on selected min_group and min_port
2174 * + promoting port counter
2175 * + setting path in remote switch fwd tbl
2176 * + setting hops in remote switch on all the ports of selected group
2177 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2178 * + NOT promoting port counter
2179 * + setting path in remote switch fwd tbl if it hasn't been set yet
2180 * + setting hops in remote switch on all the ports of each group
2181 * if it hasn't been set yet
2183 * 2. is_real_lid == TRUE && is_main_path == FALSE:
2184 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2185 * but only if the remote (upper) switch hasn't been already
2186 * configured for this target LID
2187 * + NOT promoting port counter
2188 * + setting path in remote switch fwd tbl if it hasn't been set yet
2189 * + setting hops in remote switch on all the ports of each group
2190 * if it hasn't been set yet
2192 * 3. is_real_lid == FALSE && is_main_path == TRUE:
2193 * - going UP(FALSE,TRUE) ONLY on selected min_group and min_port
2194 * + promoting port counter
2195 * + NOT setting path in remote switch fwd tbl
2196 * + NOT setting hops in remote switch
2198 * 4. is_real_lid == FALSE && is_main_path == FALSE:
2199 * - illegal state - we shouldn't get here
2202 /* covering first half of case 1, and case 3 */
2204 if (p_sw->is_leaf) {
2205 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2206 " - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2207 (is_real_lid) ? "real" : "DUMMY",
2208 cl_ntoh16(target_lid),
2209 __osm_ftree_tuple_to_str(p_sw->tuple),
2210 __osm_ftree_tuple_to_str(p_remote_sw->tuple));
2212 /* The number of downgoing routes is tracked in the
2213 p_group->counter_down p_port->counter_down counters of the
2214 group and port that belong to the lower side of the link
2215 (on switch with higher rank) */
2216 p_min_group->counter_down++;
2217 p_min_port->counter_down++;
2219 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2220 p_min_port->remote_port_num;
2221 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2222 "Switch %s: set path to CA LID %u through port %u\n",
2223 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
2224 cl_ntoh16(target_lid),
2225 p_min_port->remote_port_num);
2227 /* On the remote switch that is pointed by the min_group,
2228 set hops for ALL the ports in the remote group. */
2231 (uint16_t) cl_ptr_vector_get_size(&p_min_group->
2233 for (j = 0; j < ports_num; j++) {
2234 cl_ptr_vector_at(&p_min_group->ports, j,
2236 __osm_ftree_sw_set_hops(p_remote_sw,
2237 cl_ntoh16(target_lid),
2238 p_port->remote_port_num,
2245 Assign downgoing ports by stepping up, starting on REMOTE switch. */
2246 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2247 p_sw, /* this switch - prev. position switch for the function */
2248 target_lid, /* LID that we're routing to */
2249 target_rank, /* rank of the LID that we're routing to */
2250 is_real_lid, /* whether this target LID is real or dummy */
2251 is_main_path); /* whether this is path to HCA that should by tracked by counters */
2254 /* we're done for the third case */
2258 /* What's left to do at this point:
2260 * 1. is_real_lid == TRUE && is_main_path == TRUE:
2261 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2262 * but only if the remote (upper) switch hasn't been already
2263 * configured for this target LID
2264 * + NOT promoting port counter
2265 * + setting path in remote switch fwd tbl if it hasn't been set yet
2266 * + setting hops in remote switch on all the ports of each group
2267 * if it hasn't been set yet
2269 * 2. is_real_lid == TRUE && is_main_path == FALSE:
2270 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2271 * but only if the remote (upper) switch hasn't been already
2272 * configured for this target LID
2273 * + NOT promoting port counter
2274 * + setting path in remote switch fwd tbl if it hasn't been set yet
2275 * + setting hops in remote switch on all the ports of each group
2276 * if it hasn't been set yet
2278 * These two rules can be rephrased this way:
2279 * - foreach UP port group
2280 * + if remote switch has been set with the target LID
2281 * - skip this port group
2284 * - do NOT promote port counter
2285 * - set path in remote switch fwd tbl
2286 * - set hops in remote switch on all the ports of this group
2287 * - go UP(TRUE,FALSE) to the remote switch
2290 for (i = 0; i < p_sw->up_port_groups_num; i++) {
2291 p_group = p_sw->up_port_groups[i];
2292 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2294 /* skip if target lid has been already set on remote switch fwd tbl */
2295 if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
2298 if (p_sw->is_leaf) {
2299 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2300 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2301 cl_ntoh16(target_lid),
2302 __osm_ftree_tuple_to_str(p_sw->tuple),
2303 __osm_ftree_tuple_to_str(p_remote_sw->tuple));
2306 /* Routing REAL lids on SECONDARY path means routing
2307 switch-to-switch or switch-to-CA paths.
2308 We can safely assume that switch will initiate very
2309 few traffic, so there's no point waisting runtime on
2310 trying to balance these routes - always pick port 0. */
2312 cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
2313 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2314 p_port->remote_port_num;
2316 /* On the remote switch that is pointed by the p_group,
2317 set hops for ALL the ports in the remote group. */
2319 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2320 for (j = 0; j < ports_num; j++) {
2321 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2323 __osm_ftree_sw_set_hops(p_remote_sw,
2324 cl_ntoh16(target_lid),
2325 p_port->remote_port_num,
2331 Assign downgoing ports by stepping up, starting on REMOTE switch. */
2332 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2333 p_sw, /* this switch - prev. position switch for the function */
2334 target_lid, /* LID that we're routing to */
2335 target_rank, /* rank of the LID that we're routing to */
2336 TRUE, /* whether the target LID is real or dummy */
2337 FALSE); /* whether this is path to HCA that should by tracked by counters */
2340 } /* ftree_fabric_route_downgoing_by_going_up() */
2342 /***************************************************/
2346 * foreach leaf switch (in indexing order)
2347 * for each compute node (in indexing order)
2348 * obtain the LID of the compute node
2349 * set local LFT(LID) of the port connecting to compute node
2350 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2351 * for each MISSING compute node
2352 * call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
/*
 * Route all compute-node (CN) LIDs, leaf switch by leaf switch.
 *
 * Walks p_ftree->leaf_switches[] in index order. For every down port group
 * that survives the filters below, the leaf's new_lft entry for the HCA LID
 * is pointed at the first port of the group, __osm_ftree_sw_set_hops() is
 * called for that LID/port, and
 * __osm_ftree_fabric_route_downgoing_by_going_up() is invoked with
 * is_real_lid = TRUE and is_main_path = TRUE.
 * Afterwards the same routine is invoked with is_real_lid = FALSE once for
 * every target this leaf is short of max_cn_per_leaf.
 *
 * This listing omits some lines of the function (see the gaps in the
 * embedded line numbers); only the visible statements are described here.
 *
 * Fix: the "Routing %u dummy CAs" debug message used to print
 * max_cn_per_leaf - down_port_groups_num, which is neither the value tested
 * by the guard nor the loop bound. It now prints
 * max_cn_per_leaf - routed_targets_on_leaf, the number of dummy routes that
 * are actually performed.
 */
2355 static void __osm_ftree_fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
2359 ftree_port_group_t *p_leaf_port_group;
2360 ftree_port_group_t *p_hca_port_group;
2361 ftree_port_t *p_port;
/* number of real CN targets routed from the current leaf switch */
2365 unsigned routed_targets_on_leaf;
2367 OSM_LOG_ENTER(&p_ftree->p_osm->log);
2369 /* for each leaf switch (in indexing order) */
2370 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
2371 p_sw = p_ftree->leaf_switches[i];
2372 routed_targets_on_leaf = 0;
2374 /* for each HCA connected to this switch */
2375 for (j = 0; j < p_sw->down_port_groups_num; j++) {
2376 p_leaf_port_group = p_sw->down_port_groups[j];
2378 /* work with this port group only if the remote node is CA */
2379 if (p_leaf_port_group->remote_node_type !=
2383 p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;
2385 /* work with this port group only if remote HCA has CNs */
2390 __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
2393 CL_ASSERT(p_hca_port_group);
2395 /* work with this port group only if remote port is CN */
2396 if (!p_hca_port_group->is_cn)
2399 /* obtain the LID of HCA port */
2400 hca_lid = p_leaf_port_group->remote_base_lid;
2402 /* set local LFT(LID) to the port that is connected to HCA */
2403 cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
2405 p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = p_port->port_num;
2407 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2408 "Switch %s: set path to CN LID %u through port %u\n",
2409 __osm_ftree_tuple_to_str(p_sw->tuple),
2410 cl_ntoh16(hca_lid), p_port->port_num);
2412 /* set local min hop table(LID) to route to the CA */
2413 __osm_ftree_sw_set_hops(p_sw,
2415 p_port->port_num, 1);
2417 /* Assign downgoing ports by stepping up.
2418 Since we're routing here only CNs, we're routing it as REAL
2419 LID and updating fat-tree balancing counters. */
2420 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2421 NULL, /* prev. position switch */
2422 hca_lid, /* LID that we're routing to */
2423 p_sw->rank + 1, /* rank of the LID that we're routing to */
2424 TRUE, /* whether this HCA LID is real or dummy */
2425 TRUE); /* whether this path to HCA should be tracked by counters */
2427 /* count how many real targets have been routed from this leaf switch */
2428 routed_targets_on_leaf++;
2431 /* We're done with the real targets (all CNs) of this leaf switch.
2432 Now route the dummy HCAs that are missing or that are non-CNs.
2433 When routing to dummy HCAs we don't fill lid matrices. */
2435 if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
/* The guard above keeps the logged difference positive; it is the
   same expression that bounds the dummy-routing loop below. */
2436 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2437 "Routing %u dummy CAs\n",
2438 p_ftree->max_cn_per_leaf -
2439 routed_targets_on_leaf);
2442 (p_ftree->max_cn_per_leaf -
2443 routed_targets_on_leaf); j++) {
2444 /* assign downgoing ports by stepping up */
2445 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2446 NULL, /* prev. position switch */
2447 0, /* LID that we're routing to - ignored for dummy HCA */
2448 0, /* rank of the LID that we're routing to - ignored for dummy HCA */
2449 FALSE, /* whether this HCA LID is real or dummy */
2450 TRUE); /* whether this path to HCA should be tracked by counters */
2454 /* done going through all the leaf switches */
2455 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2456 } /* __osm_ftree_fabric_route_to_cns() */
2458 /***************************************************/
2462 * foreach HCA non-CN port in fabric
2463 * obtain the LID of the HCA port
2464 * get switch that is connected to this HCA port
2465 * set switch LFT(LID) to the port connecting to compute node
2466 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2468 * Routing to these HCAs is routing a REAL hca lid on SECONDARY path.
2469 * However, we do want to allow load-leveling of the traffic to the non-CNs,
2470 * because such nodes may include IO nodes with heavy usage
2471 * - we should set fwd tables
2472 * - we should update port counters
2473 * Routing to non-CNs is done after routing to CNs, so updated port
2474 * counters will not affect CN-to-CN routing.
/*
 * Route the LIDs of HCA ports that are not compute nodes.
 *
 * Iterates p_ftree->hca_tbl. For every up port group that is not flagged
 * is_cn and whose remote node is a switch, the attached switch's new_lft
 * entry for the group's base_lid is pointed at the switch-side port,
 * __osm_ftree_sw_set_hops() is called for that LID/port, and
 * __osm_ftree_fabric_route_downgoing_by_going_up() is invoked with
 * is_real_lid = TRUE and is_main_path = TRUE.
 *
 * This listing omits some lines of the function; only the visible
 * statements are described.
 */
2477 static void __osm_ftree_fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
2481 ftree_hca_t *p_next_hca;
2482 ftree_port_t *p_hca_port;
2483 ftree_port_group_t *p_hca_port_group;
2485 unsigned port_num_on_switch;
2488 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* walk the HCA table; the successor is fetched before the current
   element is processed */
2490 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
2491 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
2493 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
2495 for (i = 0; i < p_hca->up_port_groups_num; i++) {
2496 p_hca_port_group = p_hca->up_port_groups[i];
2498 /* skip this port if it's CN, in which case it has been already routed */
2499 if (p_hca_port_group->is_cn)
2502 /* skip this port if it is not connected to switch */
2503 if (p_hca_port_group->remote_node_type !=
2504 IB_NODE_TYPE_SWITCH)
2507 p_sw = p_hca_port_group->remote_hca_or_sw.p_sw;
2508 hca_lid = p_hca_port_group->base_lid;
2510 /* set switches LFT(LID) to the port that is connected to HCA */
/* the first port of the group supplies the switch-side port number */
2511 cl_ptr_vector_at(&p_hca_port_group->ports, 0,
2512 (void *)&p_hca_port);
2513 port_num_on_switch = p_hca_port->remote_port_num;
2514 p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = port_num_on_switch;
2516 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2517 "Switch %s: set path to non-CN HCA LID %u through port %u\n",
2518 __osm_ftree_tuple_to_str(p_sw->tuple),
2519 cl_ntoh16(hca_lid), port_num_on_switch);
2521 /* set local min hop table(LID) to route to the CA */
2522 __osm_ftree_sw_set_hops(p_sw, cl_ntoh16(hca_lid),
2523 port_num_on_switch, /* port num */
2526 /* Assign downgoing ports by stepping up.
2527 We're routing REAL targets. They are not CNs and not included
2528 in the leafs array, but we treat them as MAIN path to allow load
2529 leveling, which means that the counters will be updated. */
2530 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2531 NULL, /* prev. position switch */
2532 hca_lid, /* LID that we're routing to */
2533 p_sw->rank + 1, /* rank of the LID that we're routing to */
2534 TRUE, /* whether this HCA LID is real or dummy */
2535 TRUE); /* whether this path to HCA should be tracked by counters */
2537 /* done with all the port groups of this HCA - go to next HCA */
2540 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2541 } /* __osm_ftree_fabric_route_to_non_cns() */
2543 /***************************************************/
2547 * foreach switch in fabric
2549 * set local LFT(LID) to port 0
2550 * call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
2552 * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
2553 * - we should set fwd tables
2554 * - we should NOT update port counters
/*
 * Route the base LID of every switch in p_ftree->sw_tbl.
 *
 * For each switch: its own new_lft entry for its base_lid is set to port 0,
 * __osm_ftree_sw_set_hops() is called for that LID, and
 * __osm_ftree_fabric_route_downgoing_by_going_up() is invoked with
 * is_real_lid = TRUE and is_main_path = FALSE, using the switch's own rank
 * as the target rank.
 *
 * This listing omits some lines of the function; only the visible
 * statements are described.
 */
2557 static void __osm_ftree_fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
2560 ftree_sw_t *p_next_sw;
2562 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* walk the switch table; the successor is fetched before the current
   element is processed */
2564 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
2565 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
2567 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
2569 /* set local LFT(LID) to 0 (route to itself) */
2570 p_sw->p_osm_sw->new_lft[cl_ntoh16(p_sw->base_lid)] = 0;
2572 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2573 "Switch %s (LID %u): routing switch-to-switch paths\n",
2574 __osm_ftree_tuple_to_str(p_sw->tuple),
2575 cl_ntoh16(p_sw->base_lid));
2577 /* set min hop table of the switch to itself */
2578 __osm_ftree_sw_set_hops(p_sw, cl_ntoh16(p_sw->base_lid),
/* the target is the switch itself, so the target rank is p_sw->rank
   (the HCA-routing callers pass p_sw->rank + 1) */
2582 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2583 NULL, /* prev. position switch */
2584 p_sw->base_lid, /* LID that we're routing to */
2585 p_sw->rank, /* rank of the LID that we're routing to */
2586 TRUE, /* whether the target LID is a real or dummy */
2587 FALSE); /* whether this path should be tracked by counters */
2590 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2591 } /* __osm_ftree_fabric_route_to_switches() */
2593 /***************************************************
2594 ***************************************************/
/*
 * Fill the fat-tree HCA and switch tables from the subnet's node table.
 *
 * Iterates p_ftree->p_osm->subn.node_guid_tbl and dispatches on the node
 * type: CA nodes go to __osm_ftree_fabric_add_hca(), switch nodes to
 * __osm_ftree_fabric_add_sw(); any other type that is not a router is
 * logged as ERR AB0E.
 *
 * Returns int; the return statements are not visible in this listing.
 * NOTE(review): presumably 0 on success and non-zero on an unknown node
 * type - confirm against the full source.
 */
2596 static int __osm_ftree_fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
2598 osm_node_t *p_osm_node;
2599 osm_node_t *p_next_osm_node;
2601 OSM_LOG_ENTER(&p_ftree->p_osm->log);
2604 (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl);
2605 while (p_next_osm_node !=
2606 (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->subn.
2608 p_osm_node = p_next_osm_node;
/* advance the cursor before handling the current node */
2610 (osm_node_t *) cl_qmap_next(&p_osm_node->map_item);
2611 switch (osm_node_get_type(p_osm_node)) {
2612 case IB_NODE_TYPE_CA:
2613 __osm_ftree_fabric_add_hca(p_ftree, p_osm_node);
2615 case IB_NODE_TYPE_ROUTER:
2617 case IB_NODE_TYPE_SWITCH:
2618 __osm_ftree_fabric_add_sw(p_ftree, p_osm_node->sw);
/* any node type not handled above is reported as an error */
2621 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0E: "
2622 "Node GUID 0x%016" PRIx64
2623 " - Unknown node type: %s\n",
2624 cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
2625 ib_get_node_type_str(osm_node_get_type
2627 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2632 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2634 } /* __osm_ftree_fabric_populate_nodes() */
2636 /***************************************************
2637 ***************************************************/
/*
 * Conditionally lower the rank of a switch.
 *
 * If the switch is already ranked and its current rank is less than or
 * equal to new_rank, the assignment below is guarded off; otherwise
 * p_sw->rank becomes new_rank.
 *
 * Returns boolean_t; the return statements are not visible in this listing.
 * Callers in this file treat a true result as "the rank was changed" (they
 * enqueue the switch for further BFS processing only in that case).
 */
2639 static boolean_t __osm_ftree_sw_update_rank(IN ftree_sw_t * p_sw,
2640 IN uint32_t new_rank)
/* keep the existing rank when it is already at least as small */
2642 if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank <= new_rank)
2644 p_sw->rank = new_rank;
2649 /***************************************************/
/*
 * Breadth-first ranking of switches, starting from the switches already
 * queued in p_ranking_bfs_list.
 *
 * Each dequeued switch offers rank + 1 to every switch reachable through a
 * healthy physical port (port 0 is skipped). A neighbour whose rank is
 * updated is appended to the list, and its rank is remembered as max_rank.
 * When the list is empty, max_rank is stored in p_ftree->max_switch_rank.
 *
 * This listing omits some lines of the function (including its return type
 * line); only the visible statements are described.
 */
2652 __osm_ftree_rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
2653 IN cl_list_t * p_ranking_bfs_list)
2656 ftree_sw_t *p_remote_sw;
2658 osm_node_t *p_remote_node;
2659 osm_physp_t *p_osm_port;
2661 unsigned max_rank = 0;
2663 while (!cl_is_list_empty(p_ranking_bfs_list)) {
2664 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
2665 p_node = p_sw->p_osm_sw->p_node;
2667 /* note: skipping port 0 on switches */
2668 for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
2669 p_osm_port = osm_node_get_physp_ptr(p_node, i);
2670 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2674 osm_node_get_remote_node(p_node, i, NULL);
/* only switch neighbours take part in the ranking */
2677 if (osm_node_get_type(p_remote_node) !=
2678 IB_NODE_TYPE_SWITCH)
2681 p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2682 osm_node_get_node_guid
2685 /* remote node is not a switch */
2689 /* if needed, rank the remote switch and add it to the BFS list */
2690 if (__osm_ftree_sw_update_rank
2691 (p_remote_sw, p_sw->rank + 1)) {
2692 max_rank = p_remote_sw->rank;
2693 cl_list_insert_tail(p_ranking_bfs_list,
2699 /* set FatTree maximal switch rank */
2700 p_ftree->max_switch_rank = max_rank;
2702 } /* __osm_ftree_rank_switches_from_leafs() */
2704 /***************************************************/
/*
 * Give rank 0 to every switch directly attached to the given HCA and queue
 * those switches for the BFS ranking pass.
 *
 * Scans the HCA's physical ports. Healthy ports with a remote node are
 * classified by the remote node type: a CA neighbour is logged as ERR AB0F,
 * an unrecognised type as ERR AB10, and a switch neighbour is looked up with
 * __osm_ftree_fabric_get_sw_by_guid() and offered rank 0 through
 * __osm_ftree_sw_update_rank(). A switch whose rank was updated is appended
 * to p_ranking_bfs_list.
 *
 * This listing omits some lines of the function (including its return type
 * line and return statements); only the visible statements are described.
 *
 * Fix: the port index `i` was declared `static`. It is re-initialised by the
 * for loop on every call, so static storage served no purpose and only made
 * the function non-reentrant; it is now an ordinary automatic variable.
 */
2707 __osm_ftree_rank_leaf_switches(IN ftree_fabric_t * p_ftree,
2708 IN ftree_hca_t * p_hca,
2709 IN cl_list_t * p_ranking_bfs_list)
2712 osm_node_t *p_osm_node = p_hca->p_osm_node;
2713 osm_node_t *p_remote_osm_node;
2714 osm_physp_t *p_osm_port;
2715 uint8_t i = 0;
2718 OSM_LOG_ENTER(&p_ftree->p_osm->log);
2720 for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
2721 p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
2722 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2726 osm_node_get_remote_node(p_osm_node, i, NULL);
2727 if (!p_remote_osm_node)
2730 switch (osm_node_get_type(p_remote_osm_node)) {
2731 case IB_NODE_TYPE_CA:
2732 /* HCA connected directly to another HCA - not FatTree */
2733 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0F: "
2734 "CA conected directly to another CA: "
2735 "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
2736 __osm_ftree_hca_get_guid_ho(p_hca),
2737 cl_ntoh64(osm_node_get_node_guid
2738 (p_remote_osm_node)));
2742 case IB_NODE_TYPE_ROUTER:
2743 /* leaving this port - proceeding to the next one */
2746 case IB_NODE_TYPE_SWITCH:
2747 /* continue with this port */
2751 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2752 "ERR AB10: Node GUID 0x%016" PRIx64
2753 " - Unknown node type: %s\n",
2754 cl_ntoh64(osm_node_get_node_guid
2755 (p_remote_osm_node)),
2756 ib_get_node_type_str(osm_node_get_type
2757 (p_remote_osm_node)));
2762 /* remote node is switch */
2764 p_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2765 osm_node_get_node_guid
2771 /* if needed, rank the remote switch and add it to the BFS list */
2773 if (!__osm_ftree_sw_update_rank(p_sw, 0))
2775 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2776 "Marking rank of switch that is directly connected to CA:\n"
2777 " - CA guid : 0x%016"
2779 " - Switch guid: 0x%016"
2781 " - Switch LID : %u\n",
2782 __osm_ftree_hca_get_guid_ho(p_hca),
2783 __osm_ftree_sw_get_guid_ho(p_sw),
2784 cl_ntoh16(p_sw->base_lid));
2785 cl_list_insert_tail(p_ranking_bfs_list, p_sw);
2789 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2791 } /* __osm_ftree_rank_leaf_switches() */
2793 /***************************************************/
/*
 * Map-apply callback that flips the ranking direction of one switch.
 * The map item is treated as an ftree_sw_t and `context` as the
 * ftree_fabric_t; the switch's rank is replaced by
 * max_switch_rank - rank.
 */
2795 static void __osm_ftree_sw_reverse_rank(IN cl_map_item_t * const p_map_item,
2798 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
2799 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
2800 p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
2803 /***************************************************
2804 ***************************************************/
/*
 * Register the switch-facing ports of one HCA in the fat-tree model.
 *
 * Scans the HCA's physical ports. For healthy ports with a remote port, the
 * remote node type decides the handling: a CA neighbour is logged as
 * ERR AB11, an unrecognised type as ERR AB12, and for a switch neighbour the
 * port is added through __osm_ftree_hca_add_port() together with an is_cn
 * flag. When no CN guid file is configured the flag handling takes the
 * first branch below; otherwise the port GUID is looked up in a qmap.
 *
 * This listing omits some lines of the function (including its return type
 * line and return statements); only the visible statements are described.
 */
2807 __osm_ftree_fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree,
2808 IN ftree_hca_t * p_hca)
2810 ftree_sw_t *p_remote_sw;
2811 osm_node_t *p_node = p_hca->p_osm_node;
2812 osm_node_t *p_remote_node;
2813 uint8_t remote_node_type;
2814 ib_net64_t remote_node_guid;
2815 osm_physp_t *p_remote_osm_port;
2817 uint8_t remote_port_num;
/* NOTE(review): is_cn is initialised once, outside the port loop. Whether it
   is reset for each port is not visible in this listing - confirm, otherwise
   a CN port could mark the later ports of the same HCA as CN too. */
2818 boolean_t is_cn = FALSE;
2821 for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
2822 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
2823 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2826 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
2828 osm_node_get_remote_node(p_node, i, &remote_port_num);
2830 if (!p_remote_osm_port)
2833 remote_node_type = osm_node_get_type(p_remote_node);
2834 remote_node_guid = osm_node_get_node_guid(p_remote_node);
2836 switch (remote_node_type) {
2837 case IB_NODE_TYPE_ROUTER:
2838 /* leaving this port - proceeding to the next one */
2841 case IB_NODE_TYPE_CA:
2842 /* HCA connected directly to another HCA - not FatTree */
2843 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB11: "
2844 "CA conected directly to another CA: "
2845 "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
2846 cl_ntoh64(osm_node_get_node_guid(p_node)),
2847 cl_ntoh64(remote_node_guid));
2851 case IB_NODE_TYPE_SWITCH:
2852 /* continue with this port */
2856 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2857 "ERR AB12: Node GUID 0x%016" PRIx64
2858 " - Unknown node type: %s\n",
2859 cl_ntoh64(remote_node_guid),
2860 ib_get_node_type_str(remote_node_type));
2865 /* remote node is switch */
2868 __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2870 CL_ASSERT(p_remote_sw);
2872 /* If CN file is not supplied, then all the CAs considered as Compute Nodes.
2873 Otherwise all the CAs are not CNs, and only guids that are present in the
2874 CN file will be marked as compute nodes. */
2875 if (!__osm_ftree_fabric_cns_provided(p_ftree)) {
2878 name_map_item_t *p_elem =
2879 (name_map_item_t *) cl_qmap_get(&p_ftree->
2881 cl_ntoh64(osm_physp_get_port_guid
2884 (name_map_item_t *) cl_qmap_end(&p_ftree->
2892 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2893 "Marking CN port GUID 0x%016" PRIx64 "\n",
2894 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
2896 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2897 "Marking non-CN port GUID 0x%016" PRIx64 "\n",
2898 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
/* the remote LID is read from port 0 of the remote (switch) node */
2901 __osm_ftree_hca_add_port(p_hca, /* local ftree_hca object */
2902 i, /* local port number */
2903 remote_port_num, /* remote port number */
2904 osm_node_get_base_lid(p_node, i), /* local lid */
2905 osm_node_get_base_lid(p_remote_node, 0), /* remote lid */
2906 osm_physp_get_port_guid(p_osm_port), /* local port guid */
2907 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */
2908 remote_node_guid, /* remote node guid */
2909 remote_node_type, /* remote node type */
2910 (void *)p_remote_sw, /* remote ftree_hca/sw object */
2911 is_cn); /* whether this port is compute node */
2916 } /* __osm_ftree_fabric_construct_hca_ports() */
2918 /***************************************************
2919 ***************************************************/
/*
 * Register the ports of one switch in the fat-tree model.
 *
 * Scans the switch's physical ports starting at port 1. Loopback links are
 * logged and ignored. For a CA neighbour the direction is DOWN and the
 * remote LID comes from the remote physical port; for a switch neighbour the
 * two ranks must differ by exactly one (otherwise an error is logged), the
 * direction is UP when this switch's rank is the larger one and DOWN
 * otherwise, and the remote LID comes from port 0 of the remote node. An
 * unrecognised node type is logged as ERR AB13. Accepted ports are added via
 * __osm_ftree_sw_add_port() and p_ftree->lft_max_lid_ho is raised to the
 * largest remote LID (host order) seen.
 *
 * Returns int; the return statements are not visible in this listing.
 */
2921 static int __osm_ftree_fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
2922 IN ftree_sw_t * p_sw)
2924 ftree_hca_t *p_remote_hca;
2925 ftree_sw_t *p_remote_sw;
2926 osm_node_t *p_node = p_sw->p_osm_sw->p_node;
2927 osm_node_t *p_remote_node;
2928 ib_net16_t remote_base_lid;
2929 uint8_t remote_node_type;
2930 ib_net64_t remote_node_guid;
2931 osm_physp_t *p_remote_osm_port;
2932 ftree_direction_t direction;
2933 void *p_remote_hca_or_sw;
2935 uint8_t remote_port_num;
2938 CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
/* the scan starts at port 1, so port 0 is never examined */
2940 for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
2941 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
2942 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2945 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
2946 if (!p_remote_osm_port)
2950 osm_node_get_remote_node(p_node, i, &remote_port_num);
2952 /* ignore any loopback connection on switch */
2953 if (p_node == p_remote_node) {
2954 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2955 "Ignoring loopback on switch GUID 0x%016" PRIx64
2956 ", LID %u, rank %u\n",
2957 __osm_ftree_sw_get_guid_ho(p_sw),
2958 cl_ntoh16(p_sw->base_lid),
2963 remote_node_type = osm_node_get_type(p_remote_node);
2964 remote_node_guid = osm_node_get_node_guid(p_remote_node);
2966 switch (remote_node_type) {
2967 case IB_NODE_TYPE_ROUTER:
2968 /* leaving this port - proceeding to the next one */
2971 case IB_NODE_TYPE_CA:
2972 /* switch connected to hca */
2975 __osm_ftree_fabric_get_hca_by_guid(p_ftree,
2977 CL_ASSERT(p_remote_hca);
2979 p_remote_hca_or_sw = (void *)p_remote_hca;
2980 direction = FTREE_DIRECTION_DOWN;
2983 osm_physp_get_base_lid(p_remote_osm_port);
2986 case IB_NODE_TYPE_SWITCH:
2987 /* switch connected to another switch */
2990 __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2992 CL_ASSERT(p_remote_sw);
2994 p_remote_hca_or_sw = (void *)p_remote_sw;
/* NOTE(review): abs() takes an int. If `rank` is an unsigned type (the rank
   is set from a uint32_t and printed with %u elsewhere in this file), the
   subtraction wraps and the check relies on the unsigned-to-int conversion -
   confirm the field type and consider an explicit larger-minus-smaller
   comparison. */
2996 if (abs(p_sw->rank - p_remote_sw->rank) != 1) {
2997 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2999 "Illegal link between switches with ranks %u and %u:\n"
3000 " GUID 0x%016" PRIx64
3001 ", LID %u, rank %u\n"
3002 " GUID 0x%016" PRIx64
3003 ", LID %u, rank %u\n", p_sw->rank,
3005 __osm_ftree_sw_get_guid_ho(p_sw),
3006 cl_ntoh16(p_sw->base_lid), p_sw->rank,
3007 __osm_ftree_sw_get_guid_ho(p_remote_sw),
3008 cl_ntoh16(p_remote_sw->base_lid),
/* a larger local rank than the neighbour's means the link goes UP */
3014 if (p_sw->rank > p_remote_sw->rank)
3015 direction = FTREE_DIRECTION_UP;
3017 direction = FTREE_DIRECTION_DOWN;
3019 /* switch LID is only in port 0 port_info structure */
3021 osm_node_get_base_lid(p_remote_node, 0);
3026 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3027 "ERR AB13: Node GUID 0x%016" PRIx64
3028 " - Unknown node type: %s\n",
3029 cl_ntoh64(remote_node_guid),
3030 ib_get_node_type_str(remote_node_type));
3034 __osm_ftree_sw_add_port(p_sw, /* local ftree_sw object */
3035 i, /* local port number */
3036 remote_port_num, /* remote port number */
3037 p_sw->base_lid, /* local lid */
3038 remote_base_lid, /* remote lid */
3039 osm_physp_get_port_guid(p_osm_port), /* local port guid */
3040 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */
3041 remote_node_guid, /* remote node guid */
3042 remote_node_type, /* remote node type */
3043 p_remote_hca_or_sw, /* remote ftree_hca/sw object */
3044 direction); /* port direction (up or down) */
3046 /* Track the max lid (in host order) that exists in the fabric */
3047 if (cl_ntoh16(remote_base_lid) > p_ftree->lft_max_lid_ho)
3048 p_ftree->lft_max_lid_ho = cl_ntoh16(remote_base_lid);
3053 } /* __osm_ftree_fabric_construct_sw_ports() */
3055 /***************************************************
3056 ***************************************************/
/*
 * Rank all switches by BFS distance from the user-supplied root switches.
 *
 * Every GUID in p_ftree->root_guid_list is resolved with
 * __osm_ftree_fabric_get_sw_by_guid(); GUIDs that are not found are logged
 * as ERR AB24, found switches are queued. If no root ends up in the queue,
 * ERR AB25 is logged. The queue is then drained: each switch offers
 * rank + 1 to the switches behind its healthy ports (port 0 skipped), and a
 * neighbour whose rank was updated is queued in turn. The rank of the last
 * updated switch is stored in p_ftree->max_switch_rank.
 *
 * Returns int; the return statements are not visible in this listing.
 */
3058 static int __osm_ftree_fabric_rank_from_roots(IN ftree_fabric_t * p_ftree)
3060 osm_node_t *p_osm_node;
3061 osm_node_t *p_remote_osm_node;
3062 osm_physp_t *p_osm_physp;
3064 ftree_sw_t *p_remote_sw;
3065 cl_list_t ranking_bfs_list;
3066 struct guid_list_item *item;
3069 unsigned max_rank = 0;
3072 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3073 cl_list_init(&ranking_bfs_list, 10);
3075 /* Rank all the roots and add them to list */
3076 for (item = (void *)cl_qlist_head(&p_ftree->root_guid_list);
3077 item != (void *)cl_qlist_end(&p_ftree->root_guid_list);
3078 item = (void *)cl_qlist_next(&item->list)) {
/* item->guid is converted with cl_hton64() before the lookup */
3080 __osm_ftree_fabric_get_sw_by_guid(p_ftree,
3081 cl_hton64(item->guid));
3083 /* the specified root guid wasn't found in the fabric */
3084 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3085 "Root switch GUID 0x%" PRIx64 " not found\n",
3090 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3091 "Ranking root switch with GUID 0x%" PRIx64 "\n",
3094 cl_list_insert_tail(&ranking_bfs_list, p_sw);
/* at this point the list holds only the roots that were found */
3097 num_roots = cl_list_count(&ranking_bfs_list);
3099 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3100 "No valid roots supplied\n");
3105 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3106 "Ranked %u valid root switches\n", num_roots);
3108 /* Now the list has all the roots.
3109 BFS the subnet and update rank on all the switches. */
3111 while (!cl_is_list_empty(&ranking_bfs_list)) {
3112 p_sw = (ftree_sw_t *) cl_list_remove_head(&ranking_bfs_list);
3113 p_osm_node = p_sw->p_osm_sw->p_node;
3115 /* note: skipping port 0 on switches */
3116 for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) {
3117 p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i);
3118 if (!p_osm_physp || !osm_link_is_healthy(p_osm_physp))
3122 osm_node_get_remote_node(p_osm_node, i, NULL);
3123 if (!p_remote_osm_node)
/* only switch neighbours take part in the ranking */
3126 if (osm_node_get_type(p_remote_osm_node) !=
3127 IB_NODE_TYPE_SWITCH)
3130 p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
3131 osm_node_get_node_guid
3132 (p_remote_osm_node));
3133 CL_ASSERT(p_remote_sw);
3135 /* if needed, rank the remote switch and add it to the BFS list */
3136 if (__osm_ftree_sw_update_rank
3137 (p_remote_sw, p_sw->rank + 1)) {
3138 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3139 "Ranking switch 0x%" PRIx64
3141 __osm_ftree_sw_get_guid_ho(p_remote_sw),
3143 max_rank = p_remote_sw->rank;
3144 cl_list_insert_tail(&ranking_bfs_list,
3148 /* done with ports of this switch - go to the next switch in the list */
3151 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3152 "Subnet ranking completed. Max Node Rank = %u\n", max_rank);
3154 /* set FatTree maximal switch rank */
3155 p_ftree->max_switch_rank = max_rank;
3158 cl_list_destroy(&ranking_bfs_list);
3159 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3161 } /* __osm_ftree_fabric_rank_from_roots() */
3163 /***************************************************
3164 ***************************************************/
/*
 * Rank all switches starting from the HCAs (used when no roots are given).
 *
 * For every HCA in p_ftree->hca_tbl, __osm_ftree_rank_leaf_switches() ranks
 * and queues the switches attached to it; a non-zero result is logged as
 * ERR AB14. __osm_ftree_rank_switches_from_leafs() then ranks the remaining
 * switches from that queue, and finally __osm_ftree_sw_reverse_rank is
 * applied to every entry of sw_tbl to reverse the ranking direction.
 *
 * Returns int; the return statements are not visible in this listing.
 *
 * Fix: the ERR AB14 message was the only log message here without a
 * terminating "\n", so the next log record would run onto the same line.
 */
3166 static int __osm_ftree_fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
3169 ftree_hca_t *p_next_hca;
3170 cl_list_t ranking_bfs_list;
3173 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3175 cl_list_init(&ranking_bfs_list, 10);
3177 /* Mark REVERSED rank of all the switches in the subnet.
3178 Start from switches that are connected to hca's, and
3179 scan all the switches in the subnet. */
3180 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3181 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3183 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3184 if (__osm_ftree_rank_leaf_switches
3185 (p_ftree, p_hca, &ranking_bfs_list) != 0) {
3187 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB14: "
3188 "Subnet ranking failed - subnet is not FatTree\n");
3193 /* Now rank rest of the switches in the fabric, while the
3194 list already contains all the ranked leaf switches */
3195 __osm_ftree_rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
3197 /* fix ranking of the switches by reversing the ranking direction */
3198 cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_sw_reverse_rank,
3202 cl_list_destroy(&ranking_bfs_list);
3203 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3205 } /* __osm_ftree_fabric_rank_from_hcas() */
3207 /***************************************************
3208 ***************************************************/
/*
 * Rank the fabric's switches.
 * Uses __osm_ftree_fabric_rank_from_roots() when root GUIDs were provided
 * and __osm_ftree_fabric_rank_from_hcas() otherwise, then logs the resulting
 * max_switch_rank.
 * Returns int; the return statement is not visible in this listing
 * (presumably `res` - confirm against the full source).
 */
3210 static int __osm_ftree_fabric_rank(IN ftree_fabric_t * p_ftree)
3214 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3216 if (__osm_ftree_fabric_roots_provided(p_ftree))
3217 res = __osm_ftree_fabric_rank_from_roots(p_ftree);
3219 res = __osm_ftree_fabric_rank_from_hcas(p_ftree);
3224 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
3225 "FatTree max switch rank is %u\n", p_ftree->max_switch_rank);
3228 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3230 } /* __osm_ftree_fabric_rank() */
3232 /***************************************************
3233 ***************************************************/
/*
 * Determine p_ftree->leaf_switch_rank.
 *
 * Without a roots file the leaf rank is simply max_switch_rank. Otherwise
 * the HCA table is searched for an HCA, the first of its up port groups
 * flagged is_cn is located, and the rank of the switch behind that port
 * group becomes the leaf rank. The result is logged at INFO level.
 *
 * This listing omits some lines of the function; only the visible
 * statements are described.
 */
3235 static void __osm_ftree_fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
3239 ftree_hca_t *p_hca = NULL;
3240 ftree_hca_t *p_next_hca;
3242 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3244 if (!__osm_ftree_fabric_roots_provided(p_ftree)) {
3245 /* If root file is not provided, the fabric has to be pure fat-tree
3246 in terms of ranking. Thus, leaf switches rank is the max rank. */
3247 p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
3249 /* Find the first CN and set the leaf_switch_rank to the rank
3250 of the switch that is connected to this CN. Later we will
3251 ensure that all the leaf switches have the same rank. */
3252 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3253 while (p_next_hca !=
3254 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3259 (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3261 /* we know that there are CNs in the fabric, so just to be sure... */
3262 CL_ASSERT(p_next_hca !=
3263 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));
3265 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3266 "Selected CN port GUID 0x%" PRIx64 "\n",
3267 __osm_ftree_hca_get_guid_ho(p_hca));
/* empty-bodied loop: advances i to the first up port group with is_cn set */
3269 for (i = 0; (i < p_hca->up_port_groups_num)
3270 && (!p_hca->up_port_groups[i]->is_cn); i++) ;
3271 CL_ASSERT(i < p_hca->up_port_groups_num);
3272 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
3273 IB_NODE_TYPE_SWITCH);
3275 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
3276 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3277 "Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
3278 __osm_ftree_sw_get_guid_ho(p_sw), p_sw->rank);
3279 p_ftree->leaf_switch_rank = p_sw->rank;
3282 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
3283 "FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
3284 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3285 } /* __osm_ftree_fabric_set_leaf_rank() */
3287 /***************************************************
3288 ***************************************************/
/*
 * Build the port objects of every HCA and every switch in the fabric.
 * Calls __osm_ftree_fabric_construct_hca_ports() for each entry of hca_tbl
 * and __osm_ftree_fabric_construct_sw_ports() for each entry of sw_tbl; a
 * non-zero result from either enters an error branch whose body is not
 * visible in this listing.
 * Returns int; the return statements are not visible in this listing.
 */
3290 static int __osm_ftree_fabric_populate_ports(IN ftree_fabric_t * p_ftree)
3293 ftree_hca_t *p_next_hca;
3295 ftree_sw_t *p_next_sw;
3298 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* first pass: HCAs */
3300 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3301 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3303 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3304 if (__osm_ftree_fabric_construct_hca_ports(p_ftree, p_hca) != 0) {
/* second pass: switches */
3310 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3311 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3313 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3314 if (__osm_ftree_fabric_construct_sw_ports(p_ftree, p_sw) != 0) {
3320 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3322 } /* __osm_ftree_fabric_populate_ports() */
3324 /***************************************************
3325 ***************************************************/
/*
 * Callback handed to parse_node_map() for the root GUID file.
 * `cxt` is the destination cl_qlist_t; a guid_list_item is heap-allocated
 * and appended to its tail.
 * NOTE(review): the malloc() failure check, the store of `guid` into the
 * item and the return statements are not visible in this listing - confirm
 * they exist in the full source.
 */
3326 static int add_guid_item_to_list(void *cxt, uint64_t guid, char *p)
3328 cl_qlist_t *list = cxt;
3329 struct guid_list_item *item;
3331 item = malloc(sizeof(*item));
3336 cl_qlist_insert_tail(list, &item->list);
/*
 * Callback handed to parse_node_map() for the compute-node GUID file.
 * `cxt` is the destination cl_qmap_t; a name_map_item_t is heap-allocated
 * and inserted under the key `guid`.
 * NOTE(review): the malloc() failure check and the return statements are
 * not visible in this listing - confirm they exist in the full source.
 */
3341 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3343 cl_qmap_t *map = cxt;
3344 name_map_item_t *item;
3346 item = malloc(sizeof(*item));
3351 cl_qmap_insert(map, guid, &item->item);
/*
 * Load the optional root-GUID and compute-node-GUID files.
 *
 * When roots are configured, subn.opt.root_guid_file is parsed with
 * parse_node_map() into p_ftree->root_guid_list; an empty resulting list is
 * logged as ERR AB22. When CNs are configured, subn.opt.cn_guid_file is
 * parsed into p_ftree->cn_guid_tbl; an empty resulting map is logged as
 * ERR AB23. A non-zero result from parse_node_map() enters an error branch
 * whose body is not visible in this listing.
 *
 * Returns int; the return statements are not visible in this listing.
 */
3356 static int __osm_ftree_fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
3360 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3362 if (__osm_ftree_fabric_roots_provided(p_ftree)) {
3363 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3364 "Fetching root nodes from file %s\n",
3365 p_ftree->p_osm->subn.opt.root_guid_file);
3367 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3368 add_guid_item_to_list,
3369 &p_ftree->root_guid_list)) {
/* a file that parsed but produced no entries is reported as an error */
3374 if (!cl_qlist_count(&p_ftree->root_guid_list)) {
3375 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB22: "
3376 "Root guids file has no valid guids\n");
3382 if (__osm_ftree_fabric_cns_provided(p_ftree)) {
3383 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3384 "Fetching compute nodes from file %s\n",
3385 p_ftree->p_osm->subn.opt.cn_guid_file);
3387 if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
3388 add_guid_item_to_map,
3389 &p_ftree->cn_guid_tbl)) {
/* a file that parsed but produced no entries is reported as an error */
3394 if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) {
3395 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB23: "
3396 "Compute node guids file has no valid guids\n");
3403 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3405 } /*__osm_ftree_fabric_read_guid_files() */
3407 /***************************************************
3408 ***************************************************/
/*
 * Routing engine "build_lid_matrices" callback (installed by
 * osm_ucast_ftree_setup); 'context' is the ftree_fabric_t of the engine.
 *
 * Rebuilds the FatTree view of the subnet from scratch:
 *   1. clear whatever is left from a previous run;
 *   2. sanity checks: LMC must be 0, at least 2 switches, at least 2
 *      non-switch nodes;
 *   3. populate the switch and CA tables;
 *   4. read the optional root / compute node guid files;
 *   5. require at least 2 CAs;
 *   6. rank the switches, build the port arrays, set the leaf rank and
 *      check the overall tree rank against FAT_TREE_MIN/MAX_RANK;
 *   7. mark leaf switches, index all switches, build the leaf switch array;
 *   8. compute max CNs per leaf, dump topology info;
 *   9. validate the topology unless the user provided root guids.
 * Every failed step is reported at OSM_LOG_SYS level.
 *
 * NOTE(review): the value returned and the exact failure handling should
 * be checked against the complete function - the final log line reports
 * a "status" value.
 */
3410 static int __osm_ftree_construct_fabric(IN void *context)
3412 ftree_fabric_t *p_ftree = context;
3415 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* always start from an empty fabric object */
3417 __osm_ftree_fabric_clear(p_ftree);
3419 if (p_ftree->p_osm->subn.opt.lmc > 0) {
3420 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3421 "LMC > 0 is not supported by fat-tree routing.\n"
3422 "Falling back to default routing\n");
3427 if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
3428 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3429 "Fabric has %u switches - topology is not fat-tree.\n"
3430 "Falling back to default routing\n",
3431 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
/* nodes that are not switches = all nodes - switches */
3436 if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
3437 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
3438 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3439 "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
3440 "Falling back to default routing\n",
3441 cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
3442 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
3447 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
3448 " |----------------------------------------|\n"
3449 " |- Starting FatTree fabric construction -|\n"
3450 " |----------------------------------------|\n\n");
3452 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3453 "Populating FatTree Switch and CA tables\n");
3454 if (__osm_ftree_fabric_populate_nodes(p_ftree) != 0) {
3455 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3456 "Fabric topology is not fat-tree - "
3457 "falling back to default routing\n");
3462 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3463 "Reading guid files provided by user\n");
3464 if (__osm_ftree_fabric_read_guid_files(p_ftree) != 0) {
3465 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3466 "Failed reading guid files - "
3467 "falling back to default routing\n");
/* hca_tbl was filled by __osm_ftree_fabric_populate_nodes() above */
3472 if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
3473 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3474 "Fabric has %u CAs - topology is not fat-tree.\n"
3475 "Falling back to default routing\n",
3476 cl_qmap_count(&p_ftree->hca_tbl));
3481 /* Rank all the switches in the fabric.
3482 After that we will know only fabric max switch rank.
3483 We will be able to check leaf switches rank and the
3484 whole tree rank after filling ports and marking CNs. */
3485 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
3486 if (__osm_ftree_fabric_rank(p_ftree) != 0) {
3487 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3488 "Failed ranking the tree\n");
3493 /* For each hca and switch, construct array of ports.
3494 This is done after the whole FatTree data structure is ready,
3495 because we want the ports to have pointers to ftree_{sw,hca}_t
3496 objects, and we need the switches to be already ranked because
3497 that's how the port direction is determined. */
3498 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3499 "Populating CA & switch ports\n");
3500 if (__osm_ftree_fabric_populate_ports(p_ftree) != 0) {
3501 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3502 "Fabric topology is not a fat-tree\n");
3505 } else if (p_ftree->cn_num == 0) {
3506 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3507 "Fabric has no valid compute nodes\n");
3512 /* Now that the CA ports have been created and CNs were marked,
3513 we can complete the fabric ranking - set leaf switches rank. */
3514 __osm_ftree_fabric_set_leaf_rank(p_ftree);
/* the supported rank range is [FAT_TREE_MIN_RANK, FAT_TREE_MAX_RANK] */
3516 if (__osm_ftree_fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
3517 __osm_ftree_fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
3518 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3519 "Fabric rank is %u (should be between %u and %u)\n",
3520 __osm_ftree_fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
3526 /* Mark all the switches in the fabric with rank equal to
3527 p_ftree->leaf_switch_rank and that are also connected to CNs.
3528 As a by-product, this function also runs basic topology
3529 validation - it checks that all the CNs are at the same rank. */
3530 if (__osm_ftree_fabric_mark_leaf_switches(p_ftree)) {
3531 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3532 "Fabric topology is not a fat-tree\n");
3537 /* Assign index to all the switches in the fabric.
3538 This function also sorts leaf switch array by the switch index,
3539 sorts all the port arrays of the indexed switches by remote
3540 switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */
3541 __osm_ftree_fabric_make_indexing(p_ftree);
3543 /* Create leaf switch array sorted by index.
3544 This array contains switches with rank equal to p_ftree->leaf_switch_rank
3545 and that are also connected to CNs (REAL leafs), and it may contain
3546 switches at the same leaf rank w/o CNs, if this is the order of indexing.
3547 In any case, the first and the last switches in the array are REAL leafs. */
3548 if (__osm_ftree_fabric_create_leaf_switch_array(p_ftree)) {
3549 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3550 "Fabric topology is not a fat-tree\n");
3555 /* calculate and set ftree.max_cn_per_leaf field */
3556 __osm_ftree_fabric_set_max_cn_per_leaf(p_ftree);
3558 /* print general info about fabric topology */
3559 __osm_ftree_fabric_dump_general_info(p_ftree);
3561 /* dump full tree topology */
3562 if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
3563 __osm_ftree_fabric_dump(p_ftree);
3565 /* the fabric is required to be PURE fat-tree only if the root
3566 guid file hasn't been provided by user */
3567 if (!__osm_ftree_fabric_roots_provided(p_ftree) &&
3568 !__osm_ftree_fabric_validate_topology(p_ftree)) {
3569 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3570 "Fabric topology is not a fat-tree\n");
3575 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3576 "Max LID in switch LFTs: %u\n",
3577 p_ftree->lft_max_lid_ho);
/* NOTE(review): presumably the clean-up below runs only on failure and
   fabric_built is set only on success - confirm the guarding condition.
   __osm_ftree_do_routing() checks fabric_built before routing. */
3581 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3582 "Clearing FatTree Fabric data structures\n");
3583 __osm_ftree_fabric_clear(p_ftree);
3585 p_ftree->fabric_built = TRUE;
3587 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
3588 " |--------------------------------------------------|\n"
3589 " |- Done constructing FatTree fabric (status = %d) -|\n"
3590 " |--------------------------------------------------|\n\n",
3593 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3595 } /* __osm_ftree_construct_fabric() */
3597 /***************************************************
3598 ***************************************************/
/*
 * Routing engine "ucast_build_fwd_tables" callback (installed by
 * osm_ucast_ftree_setup); 'context' is the ftree_fabric_t of the engine.
 *
 * Fills the forwarding tables in three passes - towards compute nodes,
 * towards non-CN targets, and switch-to-switch - then applies
 * __osm_ftree_set_sw_fwd_table to every entry of sw_tbl and finally
 * writes the HCA ordering dump.
 *
 * NOTE(review): presumably routing is skipped with an error status when
 * the fabric has not been built - confirm against the guard below.
 */
3600 static int __osm_ftree_do_routing(IN void *context)
3602 ftree_fabric_t *p_ftree = context;
3605 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* fabric_built is set by __osm_ftree_construct_fabric() */
3607 if (!p_ftree->fabric_built) {
3612 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3613 "Starting FatTree routing\n");
3615 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3616 "Filling switch forwarding tables for Compute Nodes\n");
3617 __osm_ftree_fabric_route_to_cns(p_ftree);
3619 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3620 "Filling switch forwarding tables for non-CN targets\n");
3621 __osm_ftree_fabric_route_to_non_cns(p_ftree);
3623 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3624 "Filling switch forwarding tables for switch-to-switch paths\n");
3625 __osm_ftree_fabric_route_to_switches(p_ftree);
3627 /* for each switch, set its fwd table */
3628 cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_set_sw_fwd_table,
3631 /* write out hca ordering file */
3632 __osm_ftree_fabric_dump_hca_ordering(p_ftree);
3634 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3635 "FatTree routing is done\n");
3638 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3642 /***************************************************
3643 ***************************************************/
/*
 * Routing engine "delete" callback (installed by osm_ucast_ftree_setup).
 * 'context' is the ftree_fabric_t of the engine; it is handed to
 * __osm_ftree_fabric_destroy().
 */
3645 static void __osm_ftree_delete(IN void *context)
3649 __osm_ftree_fabric_destroy((ftree_fabric_t *) context);
3652 /***************************************************
3653 ***************************************************/
3655 int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
3657 ftree_fabric_t *p_ftree = __osm_ftree_fabric_create();
3661 p_ftree->p_osm = p_osm;
3663 r->context = (void *)p_ftree;
3664 r->build_lid_matrices = __osm_ftree_construct_fabric;
3665 r->ucast_build_fwd_tables = __osm_ftree_do_routing;
3666 r->delete = __osm_ftree_delete;