/*
 * Copyright (c) 2009 Simula Research Laboratory. All rights reserved.
 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
/*
 * Abstract:
 *    Implementation of OpenSM FatTree routing
 */
51 #include <iba/ib_types.h>
52 #include <complib/cl_qmap.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_UCAST_FTREE_C
56 #include <opensm/osm_opensm.h>
57 #include <opensm/osm_switch.h>
/*
 * FatTree rank is bounded between 2 and 8:
 *  - Tree of rank 1 has only trivial routing paths,
 *    so no need to use FatTree routing.
 *  - Why maximum rank is 8:
 *    Each node (switch) is assigned a unique tuple.
 *    Switches are stored in two cl_qmaps - one is
 *    ordered by guid, and the other by a key that is
 *    generated from tuple. Since cl_qmap supports only
 *    a 64-bit key, the maximal tuple length is 8 bytes,
 *    which means that maximal tree rank is 8.
 *    Note that the above also implies that each switch
 *    can have at max 255 up/down ports.
 */

#define FAT_TREE_MIN_RANK 2
#define FAT_TREE_MAX_RANK 8
/*
 * Direction of a port group relative to the node that owns it.
 * The dump code prints these as "DOWN", "SIBLING" and "UP".
 */
typedef enum {
	FTREE_DIRECTION_DOWN = -1,
	FTREE_DIRECTION_SAME,
	FTREE_DIRECTION_UP
} ftree_direction_t;
83 /***************************************************
87 ***************************************************/
/* Forward references to the structures defined further down. */
struct ftree_sw_t_;
struct ftree_hca_t_;
struct ftree_port_t_;
struct ftree_port_group_t_;
struct ftree_fabric_t_;
94 /***************************************************
96 ** ftree_tuple_t definition
98 ***************************************************/
/* Size of each static string buffer used when printing a tuple. */
#define FTREE_TUPLE_BUFF_LEN 1024
/* Number of bytes in a tuple; it must fit in the 64-bit map key below. */
#define FTREE_TUPLE_LEN 8

/* Switch index: one byte per position, 0xFF marks an unused position. */
typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
/* A tuple's bytes packed into a single 64-bit cl_qmap key. */
typedef uint64_t ftree_tuple_key_t;
106 /***************************************************
108 ** ftree_sw_table_element_t definition
110 ***************************************************/
113 cl_map_item_t map_item;
114 struct ftree_sw_t_ *p_sw;
115 } ftree_sw_tbl_element_t;
117 /***************************************************
119 ** ftree_port_t definition
121 ***************************************************/
123 typedef struct ftree_port_t_ {
124 cl_map_item_t map_item;
125 uint8_t port_num; /* port number on the current node */
126 uint8_t remote_port_num; /* port number on the remote node */
127 uint32_t counter_up; /* number of allocated routes upwards */
128 uint32_t counter_down; /* number of allocated routes downwards */
131 /***************************************************
133 ** ftree_port_group_t definition
135 ***************************************************/
/*
 * Pointer to either an HCA or a switch object; which member is valid is
 * decided by the node type stored next to it in ftree_port_group_t.
 */
typedef union ftree_hca_or_sw_ {
	struct ftree_hca_t_ *p_hca;
	struct ftree_sw_t_ *p_sw;
} ftree_hca_or_sw;
142 typedef struct ftree_port_group_t_ {
143 cl_map_item_t map_item;
144 uint16_t lid; /* lid of the current node */
145 uint16_t remote_lid; /* lid of the remote node */
146 ib_net64_t port_guid; /* port guid of this port */
147 ib_net64_t node_guid; /* this node's guid */
148 uint8_t node_type; /* this node's type */
149 ib_net64_t remote_port_guid; /* port guid of the remote port */
150 ib_net64_t remote_node_guid; /* node guid of the remote node */
151 uint8_t remote_node_type; /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
152 ftree_hca_or_sw hca_or_sw; /* pointer to this hca/switch */
153 ftree_hca_or_sw remote_hca_or_sw; /* pointer to remote hca/switch */
154 cl_ptr_vector_t ports; /* vector of ports to the same lid */
155 boolean_t is_cn; /* whether this port is a compute node */
156 boolean_t is_io; /* whether this port is an I/O node */
157 uint32_t counter_down; /* number of allocated routes downwards */
158 uint32_t counter_up; /* number of allocated routes upwards */
159 } ftree_port_group_t;
161 /***************************************************
163 ** ftree_sw_t definition
165 ***************************************************/
167 typedef struct ftree_sw_t_ {
168 cl_map_item_t map_item;
169 osm_switch_t *p_osm_sw;
173 ftree_port_group_t **down_port_groups;
174 uint8_t down_port_groups_num;
175 ftree_port_group_t **sibling_port_groups;
176 uint8_t sibling_port_groups_num;
177 ftree_port_group_t **up_port_groups;
178 uint8_t up_port_groups_num;
180 unsigned down_port_groups_idx;
182 uint32_t min_counter_down;
183 boolean_t counter_up_changed;
186 /***************************************************
188 ** ftree_hca_t definition
190 ***************************************************/
192 typedef struct ftree_hca_t_ {
193 cl_map_item_t map_item;
194 osm_node_t *p_osm_node;
195 ftree_port_group_t **up_port_groups;
196 uint8_t *disconnected_ports;
197 uint16_t up_port_groups_num;
201 /***************************************************
203 ** ftree_fabric_t definition
205 ***************************************************/
207 typedef struct ftree_fabric_t_ {
212 cl_qmap_t sw_by_tuple_tbl;
213 cl_qmap_t cn_guid_tbl;
214 cl_qmap_t io_guid_tbl;
217 uint8_t leaf_switch_rank;
218 uint8_t max_switch_rank;
219 ftree_sw_t **leaf_switches;
220 uint32_t leaf_switches_num;
221 uint16_t max_cn_per_leaf;
222 uint16_t lft_max_lid;
223 boolean_t fabric_built;
226 static inline osm_subn_t *ftree_get_subnet(IN ftree_fabric_t * p_ftree)
228 return p_ftree->p_subn;
231 /***************************************************
235 ***************************************************/
237 static int compare_switches_by_index(IN const void *p1, IN const void *p2)
239 ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1;
240 ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2;
243 for (i = 0; i < FTREE_TUPLE_LEN; i++) {
244 if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
246 if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
252 /***************************************************/
255 compare_port_groups_by_remote_switch_index(IN const void *p1, IN const void *p2)
257 ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
258 ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
261 compare_switches_by_index(&((*pp_g1)->remote_hca_or_sw.p_sw),
262 &((*pp_g2)->remote_hca_or_sw.p_sw));
265 /***************************************************
267 ** ftree_tuple_t functions
269 ***************************************************/
271 static void tuple_init(IN ftree_tuple_t tuple)
273 memset(tuple, 0xFF, FTREE_TUPLE_LEN);
276 /***************************************************/
278 static inline boolean_t tuple_assigned(IN ftree_tuple_t tuple)
280 return (tuple[0] != 0xFF);
283 /***************************************************/
285 #define FTREE_TUPLE_BUFFERS_NUM 6
287 static const char *tuple_to_str(IN ftree_tuple_t tuple)
289 static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
290 static uint8_t ind = 0;
294 if (!tuple_assigned(tuple))
295 return "INDEX.NOT.ASSIGNED";
297 buffer[ind][0] = '\0';
299 for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
300 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
301 return "INDEX.TOO.LONG";
303 strcat(buffer[ind], ".");
304 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
307 ret_buffer = buffer[ind];
308 ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
310 } /* tuple_to_str() */
312 /***************************************************/
314 static inline ftree_tuple_key_t tuple_to_key(IN ftree_tuple_t tuple)
316 ftree_tuple_key_t key;
317 memcpy(&key, tuple, FTREE_TUPLE_LEN);
321 /***************************************************/
323 static inline void tuple_from_key(IN ftree_tuple_t tuple,
324 IN ftree_tuple_key_t key)
326 memcpy(tuple, &key, FTREE_TUPLE_LEN);
329 /***************************************************
331 ** ftree_sw_tbl_element_t functions
333 ***************************************************/
335 static ftree_sw_tbl_element_t *sw_tbl_element_create(IN ftree_sw_t * p_sw)
337 ftree_sw_tbl_element_t *p_element =
338 (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
341 memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
343 p_element->p_sw = p_sw;
347 /***************************************************/
349 static void sw_tbl_element_destroy(IN ftree_sw_tbl_element_t * p_element)
354 /***************************************************
356 ** ftree_port_t functions
358 ***************************************************/
360 static ftree_port_t *port_create(IN uint8_t port_num,
361 IN uint8_t remote_port_num)
363 ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
366 memset(p_port, 0, sizeof(ftree_port_t));
368 p_port->port_num = port_num;
369 p_port->remote_port_num = remote_port_num;
374 /***************************************************/
376 static void port_destroy(IN ftree_port_t * p_port)
381 /***************************************************
383 ** ftree_port_group_t functions
385 ***************************************************/
387 static ftree_port_group_t *port_group_create(IN uint16_t lid,
388 IN uint16_t remote_lid,
389 IN ib_net64_t port_guid,
390 IN ib_net64_t node_guid,
391 IN uint8_t node_type,
392 IN void *p_hca_or_sw,
393 IN ib_net64_t remote_port_guid,
394 IN ib_net64_t remote_node_guid,
395 IN uint8_t remote_node_type,
396 IN void *p_remote_hca_or_sw,
400 ftree_port_group_t *p_group =
401 (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
404 memset(p_group, 0, sizeof(ftree_port_group_t));
407 p_group->remote_lid = remote_lid;
408 memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
409 memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
410 memcpy(&p_group->remote_port_guid, &remote_port_guid,
412 memcpy(&p_group->remote_node_guid, &remote_node_guid,
415 p_group->node_type = node_type;
417 case IB_NODE_TYPE_CA:
418 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
420 case IB_NODE_TYPE_SWITCH:
421 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
424 /* we shouldn't get here - port is created only in hca or switch */
428 p_group->remote_node_type = remote_node_type;
429 switch (remote_node_type) {
430 case IB_NODE_TYPE_CA:
431 p_group->remote_hca_or_sw.p_hca =
432 (ftree_hca_t *) p_remote_hca_or_sw;
434 case IB_NODE_TYPE_SWITCH:
435 p_group->remote_hca_or_sw.p_sw =
436 (ftree_sw_t *) p_remote_hca_or_sw;
439 /* we shouldn't get here - port is created only in hca or switch */
443 cl_ptr_vector_init(&p_group->ports, 0, /* min size */
445 p_group->is_cn = is_cn;
446 p_group->is_io = is_io;
448 } /* port_group_create() */
450 /***************************************************/
452 static void port_group_destroy(IN ftree_port_group_t * p_group)
456 ftree_port_t *p_port;
461 /* remove all the elements of p_group->ports vector */
462 size = cl_ptr_vector_get_size(&p_group->ports);
463 for (i = 0; i < size; i++)
464 if (cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port) == CL_SUCCESS)
465 port_destroy(p_port);
467 cl_ptr_vector_destroy(&p_group->ports);
469 } /* port_group_destroy() */
471 /***************************************************/
473 static void port_group_dump(IN ftree_fabric_t * p_ftree,
474 IN ftree_port_group_t * p_group,
475 IN ftree_direction_t direction)
477 ftree_port_t *p_port;
485 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
488 size = cl_ptr_vector_get_size(&p_group->ports);
490 buff = calloc(10, 1024);
492 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB33: "
493 "Failed to allocate buffer\n");
497 for (i = 0; i < size; i++) {
498 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
503 sprintf(buff + strlen(buff), "%u", p_port->port_num);
506 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
507 " Port Group of size %u, port(s): %s, direction: %s\n"
508 " Local <--> Remote GUID (LID):"
509 "0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
511 (direction == FTREE_DIRECTION_DOWN) ? "DOWN" : (direction ==
512 FTREE_DIRECTION_SAME)
513 ? "SIBLING" : "UP", cl_ntoh64(p_group->port_guid),
514 p_group->lid, cl_ntoh64(p_group->remote_port_guid),
515 p_group->remote_lid);
519 } /* port_group_dump() */
521 /***************************************************/
523 static void port_group_add_port(IN ftree_port_group_t * p_group,
524 IN uint8_t port_num, IN uint8_t remote_port_num)
527 ftree_port_t *p_port;
529 for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) {
530 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
531 if (p_port->port_num == port_num)
535 p_port = port_create(port_num, remote_port_num);
537 cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
540 /***************************************************
542 ** ftree_sw_t functions
544 ***************************************************/
546 static ftree_sw_t *sw_create(IN osm_switch_t * p_osm_sw)
551 /* make sure that the switch has ports */
552 if (p_osm_sw->num_ports == 1)
555 p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
558 memset(p_sw, 0, sizeof(ftree_sw_t));
560 p_sw->p_osm_sw = p_osm_sw;
561 p_sw->rank = 0xFFFFFFFF;
562 tuple_init(p_sw->tuple);
565 cl_ntoh16(osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0));
567 ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
568 p_sw->down_port_groups =
569 (ftree_port_group_t **) malloc(ports_num *
570 sizeof(ftree_port_group_t *));
571 if (p_sw->down_port_groups == NULL)
573 memset(p_sw->down_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
575 p_sw->up_port_groups =
576 (ftree_port_group_t **) malloc(ports_num *
577 sizeof(ftree_port_group_t *));
578 if (p_sw->up_port_groups == NULL)
580 memset(p_sw->up_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
582 p_sw->sibling_port_groups =
583 (ftree_port_group_t **) malloc(ports_num *
584 sizeof(ftree_port_group_t *));
585 if (p_sw->sibling_port_groups == NULL)
587 memset(p_sw->sibling_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
589 /* initialize lft buffer */
590 memset(p_osm_sw->new_lft, OSM_NO_PATH, p_osm_sw->lft_size);
591 p_sw->hops = malloc((p_osm_sw->max_lid_ho + 1) * sizeof(*(p_sw->hops)));
592 if (p_sw->hops == NULL)
595 memset(p_sw->hops, OSM_NO_PATH, p_osm_sw->max_lid_ho + 1);
600 free(p_sw->sibling_port_groups);
602 free(p_sw->up_port_groups);
604 free(p_sw->down_port_groups);
610 /***************************************************/
612 static void sw_destroy(IN ftree_sw_t * p_sw)
620 for (i = 0; i < p_sw->down_port_groups_num; i++)
621 port_group_destroy(p_sw->down_port_groups[i]);
622 for (i = 0; i < p_sw->sibling_port_groups_num; i++)
623 port_group_destroy(p_sw->sibling_port_groups[i]);
624 for (i = 0; i < p_sw->up_port_groups_num; i++)
625 port_group_destroy(p_sw->up_port_groups[i]);
626 free(p_sw->down_port_groups);
627 free(p_sw->sibling_port_groups);
628 free(p_sw->up_port_groups);
633 /***************************************************/
635 static uint64_t sw_get_guid_no(IN ftree_sw_t * p_sw)
639 return osm_node_get_node_guid(p_sw->p_osm_sw->p_node);
642 /***************************************************/
644 static uint64_t sw_get_guid_ho(IN ftree_sw_t * p_sw)
646 return cl_ntoh64(sw_get_guid_no(p_sw));
649 /***************************************************/
651 static void sw_dump(IN ftree_fabric_t * p_ftree, IN ftree_sw_t * p_sw)
658 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
661 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
662 "Switch index: %s, GUID: 0x%016" PRIx64
663 ", Ports: %u DOWN, %u SIBLINGS, %u UP\n",
664 tuple_to_str(p_sw->tuple), sw_get_guid_ho(p_sw),
665 p_sw->down_port_groups_num, p_sw->sibling_port_groups_num,
666 p_sw->up_port_groups_num);
668 for (i = 0; i < p_sw->down_port_groups_num; i++)
669 port_group_dump(p_ftree, p_sw->down_port_groups[i],
670 FTREE_DIRECTION_DOWN);
671 for (i = 0; i < p_sw->sibling_port_groups_num; i++)
672 port_group_dump(p_ftree, p_sw->sibling_port_groups[i],
673 FTREE_DIRECTION_SAME);
674 for (i = 0; i < p_sw->up_port_groups_num; i++)
675 port_group_dump(p_ftree, p_sw->up_port_groups[i],
680 /***************************************************/
682 static boolean_t sw_ranked(IN ftree_sw_t * p_sw)
684 return (p_sw->rank != 0xFFFFFFFF);
687 /***************************************************/
689 static ftree_port_group_t *sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
697 ftree_port_group_t **port_groups;
699 if (direction == FTREE_DIRECTION_UP) {
700 port_groups = p_sw->up_port_groups;
701 size = p_sw->up_port_groups_num;
702 } else if (direction == FTREE_DIRECTION_SAME) {
703 port_groups = p_sw->sibling_port_groups;
704 size = p_sw->sibling_port_groups_num;
706 port_groups = p_sw->down_port_groups;
707 size = p_sw->down_port_groups_num;
710 for (i = 0; i < size; i++)
711 if (remote_lid == port_groups[i]->remote_lid)
712 return port_groups[i];
715 } /* sw_get_port_group_by_remote_lid() */
717 /***************************************************/
719 static void sw_add_port(IN ftree_sw_t * p_sw, IN uint8_t port_num,
720 IN uint8_t remote_port_num, IN uint16_t lid,
721 IN uint16_t remote_lid, IN ib_net64_t port_guid,
722 IN ib_net64_t remote_port_guid,
723 IN ib_net64_t remote_node_guid,
724 IN uint8_t remote_node_type,
725 IN void *p_remote_hca_or_sw,
726 IN ftree_direction_t direction)
728 ftree_port_group_t *p_group =
729 sw_get_port_group_by_remote_lid(p_sw, remote_lid, direction);
732 p_group = port_group_create(lid, remote_lid,
733 port_guid, sw_get_guid_no(p_sw),
734 IB_NODE_TYPE_SWITCH, p_sw,
735 remote_port_guid, remote_node_guid,
737 p_remote_hca_or_sw, FALSE, FALSE);
740 if (direction == FTREE_DIRECTION_UP) {
741 p_sw->up_port_groups[p_sw->up_port_groups_num++] =
743 } else if (direction == FTREE_DIRECTION_SAME) {
745 sibling_port_groups[p_sw->sibling_port_groups_num++]
748 p_sw->down_port_groups[p_sw->down_port_groups_num++] =
751 port_group_add_port(p_group, port_num, remote_port_num);
753 } /* sw_add_port() */
755 /***************************************************/
757 static inline cl_status_t sw_set_hops(IN ftree_sw_t * p_sw, IN uint16_t lid,
758 IN uint8_t port_num, IN uint8_t hops,
759 IN boolean_t is_target_sw)
761 /* set local min hop table(LID) */
762 p_sw->hops[lid] = hops;
764 return osm_switch_set_hops(p_sw->p_osm_sw, lid, port_num, hops);
768 /***************************************************/
770 static int set_hops_on_remote_sw(IN ftree_port_group_t * p_group,
771 IN uint16_t target_lid, IN uint8_t hops,
772 IN boolean_t is_target_sw)
774 ftree_port_t *p_port;
775 uint8_t i, ports_num;
776 ftree_sw_t *p_remote_sw = p_group->remote_hca_or_sw.p_sw;
778 /* if lid is a switch, we set the min hop table in the osm_switch struct */
779 CL_ASSERT(p_group->remote_node_type == IB_NODE_TYPE_SWITCH);
780 p_remote_sw->hops[target_lid] = hops;
782 /* If target lid is a switch we set the min hop table values
783 * for each port on the associated osm_sw struct */
787 ports_num = (uint8_t) cl_ptr_vector_get_size(&p_group->ports);
788 for (i = 0; i < ports_num; i++) {
789 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
790 if (sw_set_hops(p_remote_sw, target_lid,
791 p_port->remote_port_num, hops, is_target_sw))
797 /***************************************************/
799 static inline uint8_t
800 sw_get_least_hops(IN ftree_sw_t * p_sw, IN uint16_t target_lid)
802 CL_ASSERT(p_sw->hops != NULL);
803 return p_sw->hops[target_lid];
806 /***************************************************
808 ** ftree_hca_t functions
810 ***************************************************/
812 static ftree_hca_t *hca_create(IN osm_node_t * p_osm_node)
814 ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
817 memset(p_hca, 0, sizeof(ftree_hca_t));
819 p_hca->p_osm_node = p_osm_node;
820 p_hca->up_port_groups = (ftree_port_group_t **)
821 malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
822 sizeof(ftree_port_group_t *));
823 if (!p_hca->up_port_groups) {
827 memset(p_hca->up_port_groups, 0, osm_node_get_num_physp(p_hca->p_osm_node) *
828 sizeof(ftree_port_group_t *));
830 p_hca->disconnected_ports = (uint8_t *)
831 calloc(osm_node_get_num_physp(p_hca->p_osm_node) + 1, sizeof(uint8_t));
832 if (!p_hca->disconnected_ports) {
833 free(p_hca->up_port_groups);
837 p_hca->up_port_groups_num = 0;
841 /***************************************************/
843 static void hca_destroy(IN ftree_hca_t * p_hca)
850 for (i = 0; i < p_hca->up_port_groups_num; i++)
851 port_group_destroy(p_hca->up_port_groups[i]);
853 free(p_hca->up_port_groups);
854 free(p_hca->disconnected_ports);
859 /***************************************************/
861 static uint64_t hca_get_guid_no(IN ftree_hca_t * p_hca)
865 return osm_node_get_node_guid(p_hca->p_osm_node);
868 /***************************************************/
870 static uint64_t hca_get_guid_ho(IN ftree_hca_t * p_hca)
872 return cl_ntoh64(hca_get_guid_no(p_hca));
875 /***************************************************/
877 static void hca_dump(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
884 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
887 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
888 "CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
889 hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);
891 for (i = 0; i < p_hca->up_port_groups_num; i++)
892 port_group_dump(p_ftree, p_hca->up_port_groups[i],
896 static ftree_port_group_t *hca_get_port_group_by_lid(IN ftree_hca_t *
902 for (i = 0; i < p_hca->up_port_groups_num; i++)
904 p_hca->up_port_groups[i]->lid)
905 return p_hca->up_port_groups[i];
909 /***************************************************/
911 static void hca_add_port(IN ftree_fabric_t * p_ftree,
912 IN ftree_hca_t * p_hca, IN uint8_t port_num,
913 IN uint8_t remote_port_num, IN uint16_t lid,
914 IN uint16_t remote_lid, IN ib_net64_t port_guid,
915 IN ib_net64_t remote_port_guid,
916 IN ib_net64_t remote_node_guid,
917 IN uint8_t remote_node_type,
918 IN void *p_remote_hca_or_sw, IN boolean_t is_cn,
921 ftree_port_group_t *p_group;
923 /* this function is supposed to be called only for adding ports
924 in hca's that lead to switches */
925 CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
927 p_group = hca_get_port_group_by_lid(p_hca, lid);
930 p_group = port_group_create(lid, remote_lid,
931 port_guid, hca_get_guid_no(p_hca),
932 IB_NODE_TYPE_CA, p_hca,
933 remote_port_guid, remote_node_guid,
935 p_remote_hca_or_sw, is_cn, is_io);
937 p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
938 port_group_add_port(p_group, port_num, remote_port_num);
940 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
941 "ERR AB32: Duplicated LID for CA GUID: 0x%016" PRIx64 "\n",
942 cl_ntoh64(port_guid));
943 } /* hca_add_port() */
945 /***************************************************
947 ** ftree_fabric_t functions
949 ***************************************************/
951 static ftree_fabric_t *fabric_create()
953 ftree_fabric_t *p_ftree =
954 (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
958 memset(p_ftree, 0, sizeof(ftree_fabric_t));
960 cl_qmap_init(&p_ftree->hca_tbl);
961 cl_qmap_init(&p_ftree->sw_tbl);
962 cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
963 cl_qmap_init(&p_ftree->cn_guid_tbl);
964 cl_qmap_init(&p_ftree->io_guid_tbl);
969 /***************************************************/
971 static void fabric_clear(ftree_fabric_t * p_ftree)
974 ftree_hca_t *p_next_hca;
976 ftree_sw_t *p_next_sw;
977 ftree_sw_tbl_element_t *p_element;
978 ftree_sw_tbl_element_t *p_next_element;
979 name_map_item_t *p_guid_element, *p_next_guid_element;
984 /* remove all the elements of hca_tbl */
986 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
987 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
989 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
992 cl_qmap_remove_all(&p_ftree->hca_tbl);
994 /* remove all the elements of sw_tbl */
996 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
997 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
999 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1002 cl_qmap_remove_all(&p_ftree->sw_tbl);
1004 /* remove all the elements of sw_by_tuple_tbl */
1007 (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
1008 while (p_next_element != (ftree_sw_tbl_element_t *)
1009 cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) {
1010 p_element = p_next_element;
1011 p_next_element = (ftree_sw_tbl_element_t *)
1012 cl_qmap_next(&p_element->map_item);
1013 sw_tbl_element_destroy(p_element);
1015 cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
1017 /* remove all the elements of cn_guid_tbl */
1018 p_next_guid_element =
1019 (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
1020 while (p_next_guid_element !=
1021 (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
1022 p_guid_element = p_next_guid_element;
1023 p_next_guid_element =
1024 (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1025 free(p_guid_element);
1027 cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
1029 /* remove all the elements of io_guid_tbl */
1030 p_next_guid_element =
1031 (name_map_item_t *) cl_qmap_head(&p_ftree->io_guid_tbl);
1032 while (p_next_guid_element !=
1033 (name_map_item_t *) cl_qmap_end(&p_ftree->io_guid_tbl)) {
1034 p_guid_element = p_next_guid_element;
1035 p_next_guid_element =
1036 (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1037 free(p_guid_element);
1039 cl_qmap_remove_all(&p_ftree->io_guid_tbl);
1041 /* free the leaf switches array */
1042 if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
1043 free(p_ftree->leaf_switches);
1045 p_ftree->leaf_switches_num = 0;
1046 p_ftree->cn_num = 0;
1047 p_ftree->ca_ports = 0;
1048 p_ftree->leaf_switch_rank = 0;
1049 p_ftree->max_switch_rank = 0;
1050 p_ftree->max_cn_per_leaf = 0;
1051 p_ftree->lft_max_lid = 0;
1052 p_ftree->leaf_switches = NULL;
1053 p_ftree->fabric_built = FALSE;
1055 } /* fabric_destroy() */
1057 /***************************************************/
1059 static void fabric_destroy(ftree_fabric_t * p_ftree)
1063 fabric_clear(p_ftree);
1067 /***************************************************/
1069 static uint8_t fabric_get_rank(ftree_fabric_t * p_ftree)
1071 return p_ftree->leaf_switch_rank + 1;
1074 /***************************************************/
1076 static void fabric_add_hca(ftree_fabric_t * p_ftree, osm_node_t * p_osm_node)
1080 CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
1082 p_hca = hca_create(p_osm_node);
1086 cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid,
1090 /***************************************************/
1092 static void fabric_add_sw(ftree_fabric_t * p_ftree, osm_switch_t * p_osm_sw)
1096 CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
1098 p_sw = sw_create(p_osm_sw);
1102 cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid,
1105 /* track the max lid (in host order) that exists in the fabric */
1106 if (p_sw->lid > p_ftree->lft_max_lid)
1107 p_ftree->lft_max_lid = p_sw->lid;
1110 /***************************************************/
1112 static void fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1113 IN ftree_sw_t * p_sw)
1115 CL_ASSERT(tuple_assigned(p_sw->tuple));
1117 cl_qmap_insert(&p_ftree->sw_by_tuple_tbl, tuple_to_key(p_sw->tuple),
1118 &sw_tbl_element_create(p_sw)->map_item);
1121 /***************************************************/
1123 static ftree_sw_t *fabric_get_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1124 IN ftree_tuple_t tuple)
1126 ftree_sw_tbl_element_t *p_element;
1128 CL_ASSERT(tuple_assigned(tuple));
1130 tuple_to_key(tuple);
1133 (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1134 tuple_to_key(tuple));
1136 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1139 return p_element->p_sw;
1142 /***************************************************/
1144 static ftree_sw_t *fabric_get_sw_by_guid(IN ftree_fabric_t * p_ftree,
1148 p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1149 if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1154 /***************************************************/
1156 static ftree_hca_t *fabric_get_hca_by_guid(IN ftree_fabric_t * p_ftree,
1160 p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1161 if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1166 /***************************************************/
1168 static void fabric_dump(ftree_fabric_t * p_ftree)
1174 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
1177 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1178 " |-------------------------------|\n"
1179 " |- Full fabric topology dump -|\n"
1180 " |-------------------------------|\n\n");
1182 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");
1184 for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1185 p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
1186 p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
1187 hca_dump(p_ftree, p_hca);
1190 for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1191 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1192 "-- Rank %u switches\n", i);
1193 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1194 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1195 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1196 if (p_sw->rank == i)
1197 sw_dump(p_ftree, p_sw);
1201 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1202 " |---------------------------------------|\n"
1203 " |- Full fabric topology dump completed -|\n"
1204 " |---------------------------------------|\n\n");
1205 } /* fabric_dump() */
1207 /***************************************************/
/*
 * fabric_dump_general_info() - log a summary of the discovered fabric.
 *
 * At OSM_LOG_INFO level this reports the FatTree rank (leaf_switch_rank + 1),
 * the maximal switch rank, the CA / CA port / CN / switch totals and the
 * number of switches found at every rank (rank 0 is annotated as "roots",
 * the leaf rank additionally reports leaf_switches_num).
 * When OSM_LOG_VERBOSE is active it also lists GUID, LID and tuple index of
 * the rank-0 switches and of every entry of p_ftree->leaf_switches.
 *
 * p_ftree: fabric to describe.
 */
1209 static void fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
1214 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1215 "General fabric topology info\n");
1216 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1217 "============================\n");
/* the tree rank counts levels, hence leaf_switch_rank (0-based) + 1 */
1219 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1220 " - FatTree rank (roots to leaf switches): %u\n",
1221 p_ftree->leaf_switch_rank + 1);
1222 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1223 " - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
1224 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1225 " - Fabric has %u CAs, %u CA ports (%u of them CNs), %u switches\n",
1226 cl_qmap_count(&p_ftree->hca_tbl), p_ftree->ca_ports,
1227 p_ftree->cn_num, cl_qmap_count(&p_ftree->sw_tbl));
/* sanity check: CNs are reported as a subset of the CA ports */
1229 CL_ASSERT(p_ftree->ca_ports >= p_ftree->cn_num);
/* for every rank, walk the whole switch table and report how many
   switches carry that rank (j is the value printed as the count) */
1231 for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1233 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1234 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1235 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1236 if (p_sw->rank == i)
1240 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1241 " - Fabric has %u switches at rank %u (roots)\n",
1243 else if (i == p_ftree->leaf_switch_rank)
1244 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1245 " - Fabric has %u switches at rank %u (%u of them leafs)\n",
1246 j, i, p_ftree->leaf_switches_num);
1248 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1249 " - Fabric has %u switches at rank %u\n", j,
/* the per-switch listings below are produced only for verbose logging */
1253 if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) {
1254 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1255 " - Root switches:\n");
1256 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1257 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1258 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1259 if (p_sw->rank == 0)
1260 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1261 " GUID: 0x%016" PRIx64
1262 ", LID: %u, Index %s\n",
1263 sw_get_guid_ho(p_sw),
1265 tuple_to_str(p_sw->tuple));
1268 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1269 " - Leaf switches (sorted by index):\n");
1270 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1271 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1272 " GUID: 0x%016" PRIx64
1273 ", LID: %u, Index %s\n",
1274 sw_get_guid_ho(p_ftree->leaf_switches[i]),
1275 p_ftree->leaf_switches[i]->lid,
1276 tuple_to_str(p_ftree->leaf_switches[i]->tuple));
1279 } /* fabric_dump_general_info() */
1281 /***************************************************/
/*
 * fabric_dump_hca_ordering() - write the CA ordering dump file.
 *
 * The output file is "opensm-ftree-ca-order.dump" inside the configured
 * dump_files_dir.  For every leaf switch (in p_ftree->leaf_switches order)
 * the down port groups that point to CAs are examined; for the compute-node
 * port groups a "<LID>\t<node description>" line is printed.  Each leaf is
 * then padded with "0xFFFF\tDUMMY" lines up to p_ftree->max_cn_per_leaf
 * entries.
 *
 * The data is written to "<path>.tmp" and renamed onto the final path after
 * the file is closed.  Failure to open the temporary file is logged as
 * ERR AB01, failure to rename as ERR AB03.
 *
 * NOTE(review): port groups that are not CAs / not CNs are presumably
 * skipped - confirm against the complete function body.
 */
1283 static void fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
1287 ftree_port_group_t *p_group_on_sw;
1288 ftree_port_group_t *p_group_on_hca;
1289 int rename_status = 0;
1292 unsigned printed_hcas_on_leaf;
/* path_tmp is 8 bytes larger than path so that the ".tmp" suffix fits */
1294 char path[1024], path_tmp[1032];
1295 FILE *p_hca_ordering_file;
1296 const char *filename = "opensm-ftree-ca-order.dump";
1298 snprintf(path, sizeof(path), "%s/%s",
1299 p_ftree->p_osm->subn.opt.dump_files_dir, filename);
1301 snprintf(path_tmp, sizeof(path_tmp), "%s.tmp", path);
/* write into the temporary file; it is renamed onto 'path' at the end */
1303 p_hca_ordering_file = fopen(path_tmp, "w");
1304 if (!p_hca_ordering_file) {
1305 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
1306 "cannot open file \'%s\': %s\n", path_tmp,
1311 /* for each leaf switch (in indexing order) */
1312 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1313 p_sw = p_ftree->leaf_switches[i];
1314 printed_hcas_on_leaf = 0;
1316 /* for each real CA (CNs and not) connected to this switch */
1317 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1318 p_group_on_sw = p_sw->down_port_groups[j];
1320 if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
1323 p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
1325 hca_get_port_group_by_lid(p_hca,
1329 /* treat non-compute nodes as dummies */
1330 if (!p_group_on_hca->is_cn)
1333 fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
1334 p_group_on_hca->lid,
1335 p_hca->p_osm_node->print_desc);
1337 printed_hcas_on_leaf++;
1340 /* now print missing HCAs */
1342 j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++)
1343 fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");
1346 /* done going through all the leaf switches */
1348 fclose(p_hca_ordering_file);
/* replace the final file with the freshly written temporary one */
1350 rename_status = rename(path_tmp, path);
1351 if (rename_status) {
1352 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB03: "
1353 "cannot rename file \'%s\': %s\n", path_tmp,
1356 } /* fabric_dump_hca_ordering() */
1358 /***************************************************/
/*
 * fabric_assign_tuple() - give a switch its tuple and register it.
 *
 * Copies FTREE_TUPLE_LEN bytes of new_tuple into p_sw->tuple and then calls
 * fabric_add_sw_by_tuple() for the switch.
 *
 * p_ftree:   fabric that owns the switch
 * p_sw:      switch receiving the tuple
 * new_tuple: tuple value to copy into the switch
 */
1360 static void fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
1361 IN ftree_sw_t * p_sw,
1362 IN ftree_tuple_t new_tuple)
1364 memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
1365 fabric_add_sw_by_tuple(p_ftree, p_sw);
1368 /***************************************************/
/*
 * fabric_assign_first_tuple() - build and assign the tuple of a switch that
 * is used as a starting point of the indexing.
 *
 * The first tuple digit is the switch rank.  For a rank-0 switch in a fabric
 * whose leaf_switch_rank is greater than 1, the digit at position
 * leaf_switch_rank is set to 'subtree'.  The candidate is looked up with
 * fabric_get_sw_by_tuple() (at most 0xFF attempts) and is finally handed to
 * fabric_assign_tuple().
 *
 * p_ftree: fabric that owns the switch
 * p_sw:    switch receiving its first tuple
 * subtree: subtree number stored in the tuple of a root switch
 *
 * NOTE(review): what is done when leaf_switch_rank >= FTREE_TUPLE_LEN, how
 * the candidate changes between lookup attempts and what happens when no
 * free tuple is found should be confirmed against the complete body.
 */
1370 static void fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
1371 IN ftree_sw_t * p_sw,
1372 IN unsigned int subtree)
1375 ftree_tuple_t new_tuple;
/* guards the new_tuple[leaf_switch_rank] accesses below */
1377 if (p_ftree->leaf_switch_rank >= FTREE_TUPLE_LEN)
1380 tuple_init(new_tuple);
1381 new_tuple[0] = (uint8_t) p_sw->rank;
1383 for (i = 1; i <= p_ftree->leaf_switch_rank; i++)
1386 if (p_sw->rank == 0) {
1387 if (p_ftree->leaf_switch_rank > 1)
1388 new_tuple[p_ftree->leaf_switch_rank] = subtree;
1390 for (i = 0; i < 0xFF; i++) {
1392 if (fabric_get_sw_by_tuple(p_ftree, new_tuple) == NULL)
1396 /* new tuple not found - there are more than 255 ports in one direction */
1400 fabric_assign_tuple(p_ftree, p_sw, new_tuple);
1403 /***************************************************/
/*
 * fabric_get_new_tuple() - derive a not yet used tuple for a neighbour of
 * the switch that owns from_tuple.
 *
 * new_tuple is first reset with tuple_init().  A working copy of from_tuple
 * is made and one digit of it is varied over 0..0xFE until
 * fabric_get_sw_by_tuple() reports that no switch owns the candidate; the
 * candidate is then copied into new_tuple.  The varied digit is
 * from_tuple[0] + 1 when going down and from_tuple[0] otherwise.
 *
 * p_ftree:    fabric whose tuple table is consulted
 * new_tuple:  output tuple
 * from_tuple: tuple of the switch we are coming from
 * direction:  FTREE_DIRECTION_DOWN or the opposite direction
 *
 * NOTE(review): any adjustment of the rank digit and the handling of the
 * "no free tuple" case should be confirmed against the complete body.
 */
1405 static void fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
1406 OUT ftree_tuple_t new_tuple,
1407 IN ftree_tuple_t from_tuple,
1408 IN ftree_direction_t direction)
1411 ftree_tuple_t temp_tuple;
1415 tuple_init(new_tuple);
1416 memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
/* pick the tuple digit that will be varied, depending on the direction */
1418 if (direction == FTREE_DIRECTION_DOWN) {
1420 var_index = from_tuple[0] + 1;
1423 var_index = from_tuple[0];
/* try successive values of that digit until an unused tuple is found */
1426 for (i = 0; i < 0xFF; i++) {
1427 temp_tuple[var_index] = i;
1428 p_sw = fabric_get_sw_by_tuple(p_ftree, temp_tuple);
1429 if (p_sw == NULL) /* found free tuple */
1434 /* new tuple not found - there are more than 255 ports in one direction */
1437 memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
1439 } /* fabric_get_new_tuple() */
1441 /***************************************************/
/*
 * fabric_roots_provided() - tell whether a root GUID file is configured.
 * Returns non-zero when the subnet option root_guid_file is not NULL.
 */
1443 static inline boolean_t fabric_roots_provided(IN ftree_fabric_t * p_ftree)
1445 return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1448 /***************************************************/
/*
 * fabric_cns_provided() - tell whether a compute node GUID file is configured.
 * Returns non-zero when the subnet option cn_guid_file is not NULL.
 */
1450 static inline boolean_t fabric_cns_provided(IN ftree_fabric_t * p_ftree)
1452 return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1455 /***************************************************/
/*
 * fabric_ios_provided() - tell whether an I/O node GUID file is configured.
 * Returns non-zero when the subnet option io_guid_file is not NULL.
 */
1457 static inline boolean_t fabric_ios_provided(IN ftree_fabric_t * p_ftree)
1459 return (p_ftree->p_osm->subn.opt.io_guid_file != NULL);
1462 /***************************************************/
/*
 * fabric_mark_leaf_switches() - flag the switches that have compute nodes
 * attached.
 *
 * Every CA of p_ftree->hca_tbl is visited.  For each of its upward port
 * groups that is a CN, the switch on the remote side gets is_leaf = TRUE and
 * its rank is compared with p_ftree->leaf_switch_rank; a mismatch is logged
 * as ERR AB26.
 *
 * NOTE(review): the function returns int; presumably 0 on success and a
 * non-zero value after ERR AB26 - confirm against the complete body.
 */
1464 static int fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
1468 ftree_hca_t *p_next_hca;
1472 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1474 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1475 "Marking leaf switches in fabric\n");
1477 /* Scan all the CAs, if they have CNs - find CN port and mark switch
1478 that is connected to this port as leaf switch.
1479 Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */
1480 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1481 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
/* the successor is fetched before the current CA is processed */
1483 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
1487 for (i = 0; i < p_hca->up_port_groups_num; i++) {
1488 if (!p_hca->up_port_groups[i]->is_cn)
1491 /* In CAs, port group alway has one port, and since this
1492 port group is CN, we know that this port is compute node */
1493 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
1494 IB_NODE_TYPE_SWITCH);
1495 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
1497 /* check if this switch was already processed */
1500 p_sw->is_leaf = TRUE;
1502 /* ensure that this leaf switch is at the correct tree level */
1503 if (p_sw->rank != p_ftree->leaf_switch_rank) {
1504 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1505 "ERR AB26: CN port 0x%" PRIx64
1506 " is connected to switch 0x%" PRIx64
1508 "while FatTree leaf rank is %u\n",
1510 up_port_groups[i]->port_guid),
1511 sw_get_guid_ho(p_sw), p_sw->rank,
1512 p_ftree->leaf_switch_rank);
1521 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1523 } /* fabric_mark_leaf_switches() */
1525 /***************************************************/
/*
 * bfs_fabric_indexing() - assign tuples to the switches reachable from
 * p_first_sw using a breadth-first traversal.
 *
 * A cl_list sized for the number of switches serves as the BFS queue.  For
 * every switch taken from the queue, remote switches on its port groups that
 * have no tuple yet receive one (fabric_get_new_tuple() followed by
 * fabric_assign_tuple()) and are appended to the queue.  After a direction
 * has been processed, the matching port group array of the current switch is
 * sorted with compare_port_groups_by_remote_switch_index.
 *
 * Upward port groups are followed only from non-root switches, and not from
 * leaf switches when the traversal itself started at a root switch.
 *
 * p_ftree:     fabric being indexed
 * p_first_sw:  BFS starting switch
 */
1526 static void bfs_fabric_indexing(IN ftree_fabric_t * p_ftree,
1527 IN ftree_sw_t *p_first_sw)
1529 ftree_sw_t *p_remote_sw;
1530 ftree_sw_t *p_sw = NULL;
1531 ftree_tuple_t new_tuple;
1535 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1536 cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
1538 * Now run BFS and assign indexes to all switches
1539 * Pseudo code of the algorithm is as follows:
1541 * * Add first switch to BFS queue
1542 * * While (BFS queue not empty)
1543 * - Pop the switch from the head of the queue
1544 * - Scan all the downward and upward ports
1546 * + Get the remote switch
1547 * + Assign index to the remote switch
1548 * + Add remote switch to the BFS queue
1551 cl_list_insert_tail(&bfs_list, p_first_sw);
1553 while (!cl_is_list_empty(&bfs_list)) {
1554 p_sw = (ftree_sw_t *) cl_list_remove_head(&bfs_list);
1556 /* Discover all the nodes from ports that are pointing down */
1558 if (p_sw->rank >= p_ftree->leaf_switch_rank) {
1559 /* whether downward ports are pointing to CAs or switches,
1560 we don't assign indexes to switches that are located
1561 lower than leaf switches */
1563 /* This is not the leaf switch */
1564 for (i = 0; i < p_sw->down_port_groups_num; i++) {
1565 /* Work with port groups that are pointing to switches only.
1566 No need to assign indexing to HCAs */
1568 down_port_groups[i]->remote_node_type !=
1569 IB_NODE_TYPE_SWITCH)
1573 p_sw->down_port_groups[i]->
1574 remote_hca_or_sw.p_sw;
1575 if (tuple_assigned(p_remote_sw->tuple)) {
1576 /* this switch has been already indexed */
1579 /* allocate new tuple */
1580 fabric_get_new_tuple(p_ftree, new_tuple,
1582 FTREE_DIRECTION_DOWN);
1583 /* Assign the new tuple to the remote switch.
1584 This fuction also adds the switch into the switch_by_tuple table. */
1585 fabric_assign_tuple(p_ftree, p_remote_sw,
1588 /* add the newly discovered switch to the BFS queue */
1589 cl_list_insert_tail(&bfs_list, p_remote_sw);
1591 /* Done assigning indexes to all the remote switches
1592 that are pointed by the downgoing ports.
1593 Now sort port groups according to remote index. */
1594 qsort(p_sw->down_port_groups, /* array */
1595 p_sw->down_port_groups_num, /* number of elements */
1596 sizeof(ftree_port_group_t *), /* size of each element */
1597 compare_port_groups_by_remote_switch_index); /* comparator */
1600 /* Done indexing switches from ports that go down.
1601 Now do the same with ports that are pointing up.
1602 if we started from root (rank == 0), the leaf is bsf termination point */
1604 if (p_sw->rank != 0 && (p_first_sw->rank != 0 || !p_sw->is_leaf)) {
1605 /* This is not the root switch, which means that all the ports
1606 that are pointing up are taking us to another switches. */
1607 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1609 p_sw->up_port_groups[i]->
1610 remote_hca_or_sw.p_sw;
1611 if (tuple_assigned(p_remote_sw->tuple))
1613 /* allocate new tuple */
1614 fabric_get_new_tuple(p_ftree, new_tuple,
1616 FTREE_DIRECTION_UP);
1617 /* Assign the new tuple to the remote switch.
1618 This fuction also adds the switch to the
1619 switch_by_tuple table. */
1620 fabric_assign_tuple(p_ftree,
1621 p_remote_sw, new_tuple);
1622 /* add the newly discovered switch to the BFS queue */
1623 cl_list_insert_tail(&bfs_list, p_remote_sw);
1625 /* Done assigning indexes to all the remote switches
1626 that are pointed by the upgoing ports.
1627 Now sort port groups according to remote index. */
1628 qsort(p_sw->up_port_groups, /* array */
1629 p_sw->up_port_groups_num, /* number of elements */
1630 sizeof(ftree_port_group_t *), /* size of each element */
1631 compare_port_groups_by_remote_switch_index); /* comparator */
1633 /* Done assigning indexes to all the switches that are directly connected
1634 to the current switch - go to the next switch in the BFS queue */
1636 cl_list_destroy(&bfs_list);
1638 OSM_LOG_EXIT(&p_ftree->p_osm->log);
/*
 * fabric_make_indexing() - assign tuples (indexes) to the fabric switches.
 *
 * The switch table is scanned for an indexing starting point: a rank-0
 * switch when the quasi_ftree_indexing option is set, a leaf switch
 * otherwise.  A starting switch that has no tuple yet receives its first
 * tuple from fabric_assign_first_tuple() (with a running subtree number) and
 * the details are logged; bfs_fabric_indexing() is then run from it.
 * Finally the upward port groups of every leaf switch are sorted by the
 * index of the remote switch.
 *
 * NOTE(review): with quasi_ftree_indexing == FALSE the scan presumably stops
 * after the first starting point - confirm against the complete body.
 */
1641 static void fabric_make_indexing(IN ftree_fabric_t * p_ftree)
1643 ftree_sw_t *p_sw = NULL;
1644 unsigned int subtree = 0;
1645 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1647 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1648 "Starting FatTree indexing\n");
1650 /* using the first switch as a starting point for indexing algorithm. */
1651 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1652 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1653 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1654 if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing) {
1655 /* find first root switch */
1656 if (p_sw->rank != 0)
1659 /* find first leaf switch */
1663 /* Assign the first tuple to the switch that is used as BFS starting point
1665 The tuple will be as follows: [rank].0...0.subtree
1666 This fuction also adds the switch it into the switch_by_tuple table. */
1667 if (!tuple_assigned(p_sw->tuple)) {
1668 fabric_assign_first_tuple(p_ftree, p_sw, subtree++);
1669 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1670 "Indexing starting point:\n"
1671 " - Switch rank : %u\n"
1672 " - Switch index : %s\n"
1673 " - Node LID : %u\n"
1674 " - Node GUID : 0x%016"
1675 PRIx64 "\n", p_sw->rank, tuple_to_str(p_sw->tuple),
1676 p_sw->lid, sw_get_guid_ho(p_sw));
1679 bfs_fabric_indexing(p_ftree, p_sw);
1681 if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing == FALSE)
/* second pass: order the upward port groups of the leaf switches */
1684 p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1685 while (p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1686 if (p_sw->is_leaf) {
1687 qsort(p_sw->up_port_groups, /* array */
1688 p_sw->up_port_groups_num, /* number of elements */
1689 sizeof(ftree_port_group_t *), /* size of each element */
1690 compare_port_groups_by_remote_switch_index); /* comparator */
1692 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1696 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1697 } /* fabric_make_indexing() */
1698 /***************************************************/
/*
 * fabric_create_leaf_switch_array() - build p_ftree->leaf_switches.
 *
 * All switches whose rank equals leaf_switch_rank are collected into a
 * temporary array and sorted with compare_switches_by_index.  The positions
 * of the first and the last switch marked is_leaf are located, and the slice
 * between them (inclusive) is copied into a newly allocated
 * p_ftree->leaf_switches; leaf_switches_num is set to the slice length.
 * The slice may therefore contain switches without CNs.
 *
 * The temporary array is freed before returning.  Allocation failures are
 * logged at OSM_LOG_SYS; a failure to find leaf switches is logged at
 * OSM_LOG_INFO.
 *
 * NOTE(review): the function returns int; presumably 0 on success and a
 * non-zero value on the failures above - confirm against the complete body.
 */
1700 static int fabric_create_leaf_switch_array(IN ftree_fabric_t * p_ftree)
1703 ftree_sw_t *p_next_sw;
1704 ftree_sw_t **all_switches_at_leaf_level;
1706 unsigned all_leaf_idx = 0;
1707 unsigned first_leaf_idx;
1708 unsigned last_leaf_idx;
1711 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1713 /* create array of ALL the switches that have leaf rank */
1714 all_switches_at_leaf_level = (ftree_sw_t **)
1715 malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1716 if (!all_switches_at_leaf_level) {
1717 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1718 "Fat-tree routing: Memory allocation failed\n");
1722 memset(all_switches_at_leaf_level, 0,
1723 cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1725 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1726 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1728 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1729 if (p_sw->rank == p_ftree->leaf_switch_rank) {
1730 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1731 "Adding switch 0x%" PRIx64
1732 " to full leaf switch array\n",
1733 sw_get_guid_ho(p_sw));
1734 all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
1738 /* quick-sort array of leaf switches by index */
1739 qsort(all_switches_at_leaf_level, /* array */
1740 all_leaf_idx, /* number of elements */
1741 sizeof(ftree_sw_t *), /* size of each element */
1742 compare_switches_by_index); /* comparator */
1744 /* check the first and the last REAL leaf (the one
1745 that has CNs) in the array of all the leafs */
/* start from an out-of-range value so that any real leaf replaces it */
1747 first_leaf_idx = all_leaf_idx;
1749 for (i = 0; i < all_leaf_idx; i++) {
1750 if (all_switches_at_leaf_level[i]->is_leaf) {
1751 if (i < first_leaf_idx)
1757 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1758 "Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
1759 first_leaf_idx, last_leaf_idx);
/* NOTE(review): '>=' also rejects first_leaf_idx == last_leaf_idx, which
   looks like the single-real-leaf case - confirm that this is intended */
1761 if (first_leaf_idx >= last_leaf_idx) {
1762 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
1763 "Failed to find leaf switches - topology is not "
1769 /* Create array of REAL leaf switches, sorted by index.
1770 This array may contain switches at the same rank w/o CNs,
1771 in case this is the order of indexing. */
1772 p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
1773 p_ftree->leaf_switches = (ftree_sw_t **)
1774 malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1775 if (!p_ftree->leaf_switches) {
1776 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1777 "Fat-tree routing: Memory allocation failed\n");
1782 memcpy(p_ftree->leaf_switches,
1783 &(all_switches_at_leaf_level[first_leaf_idx]),
1784 p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1786 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1787 "Created array of %u leaf switches\n",
1788 p_ftree->leaf_switches_num);
/* the temporary array is no longer needed once the slice was copied */
1791 free(all_switches_at_leaf_level);
1792 OSM_LOG_EXIT(&p_ftree->p_osm->log);
1794 } /* fabric_create_leaf_switch_array() */
1796 /***************************************************/
/*
 * fabric_set_max_cn_per_leaf() - raise p_ftree->max_cn_per_leaf to the
 * largest per-leaf CN count.
 *
 * For every switch of p_ftree->leaf_switches the downward port groups that
 * point to CAs are examined; the CA-side port group is looked up by the
 * remote LID and checked for is_cn.  The per-leaf count is compared with
 * p_ftree->max_cn_per_leaf, which is only ever increased here.
 */
1798 static void fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
1802 unsigned cns_on_this_leaf;
1804 ftree_port_group_t *p_group, *p_up_group;
1807 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1808 p_sw = p_ftree->leaf_switches[i];
1809 cns_on_this_leaf = 0;
1810 for (j = 0; j < p_sw->down_port_groups_num; j++) {
1811 p_group = p_sw->down_port_groups[j];
1812 if (p_group->remote_node_type != IB_NODE_TYPE_CA)
1814 p_hca = p_group->remote_hca_or_sw.p_hca;
1816 * Get the hca port group corresponding
1817 * to the LID of remote HCA port
1819 p_up_group = hca_get_port_group_by_lid(p_hca,
1820 p_group->remote_lid);
1822 CL_ASSERT(p_up_group);
1824 if (p_up_group->is_cn)
/* keep the maximum seen over all leaf switches */
1827 if (cns_on_this_leaf > p_ftree->max_cn_per_leaf)
1828 p_ftree->max_cn_per_leaf = cns_on_this_leaf;
1830 } /* fabric_set_max_cn_per_leaf() */
1832 /***************************************************/
/*
 * fabric_validate_topology() - check that switches of the same rank look
 * alike, as required for a FatTree.
 *
 * The first switch met at each rank becomes the reference for that rank.
 * Every other switch of the rank is compared with it:
 *   - ERR AB09: number of upward port groups differs
 *   - ERR AB0A: number of downward port groups differs (not checked at the
 *               last rank, tree_rank - 1, where some HCAs may be missing)
 *   - ERR AB0B: an upward port group has a different number of ports than
 *               the first upward group of the reference switch
 *   - ERR AB0C: a downward port group has a different number of ports than
 *               the first downward group of the reference switch (again not
 *               checked at the last rank)
 * The scan stops as soon as 'res' becomes FALSE.
 *
 * Returns boolean_t; 'res' starts as TRUE and the outcome is logged
 * (verbose message on success, ERR AB0D otherwise).
 *
 * NOTE(review): reference_sw_arr holds tree_rank entries and is indexed by
 * p_sw->rank - confirm that no switch can have rank >= tree_rank here.
 */
1834 static boolean_t fabric_validate_topology(IN ftree_fabric_t * p_ftree)
1836 ftree_port_group_t *p_group;
1837 ftree_port_group_t *p_ref_group;
1839 ftree_sw_t *p_next_sw;
1840 ftree_sw_t **reference_sw_arr;
1841 uint16_t tree_rank = fabric_get_rank(p_ftree);
1842 boolean_t res = TRUE;
1845 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1847 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1848 "Validating fabric topology\n");
1851 (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1852 if (reference_sw_arr == NULL) {
1853 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1854 "Fat-tree routing: Memory allocation failed\n");
/* a NULL slot means "no reference switch chosen for this rank yet" */
1857 memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1859 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1860 while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1862 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1864 if (!reference_sw_arr[p_sw->rank])
1865 /* This is the first switch in the current level that
1866 we're checking - use it as a reference */
1867 reference_sw_arr[p_sw->rank] = p_sw;
1869 /* compare this switch properties to the reference switch */
1871 if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1872 p_sw->up_port_groups_num) {
1873 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1874 "ERR AB09: Different number of upward port groups on switches:\n"
1875 " GUID 0x%016" PRIx64
1876 ", LID %u, Index %s - %u groups\n"
1877 " GUID 0x%016" PRIx64
1878 ", LID %u, Index %s - %u groups\n",
1880 (reference_sw_arr[p_sw->rank]),
1881 reference_sw_arr[p_sw->rank]->lid,
1883 (reference_sw_arr[p_sw->rank]->tuple),
1884 reference_sw_arr[p_sw->
1887 sw_get_guid_ho(p_sw), p_sw->lid,
1888 tuple_to_str(p_sw->tuple),
1889 p_sw->up_port_groups_num);
1894 if (p_sw->rank != (tree_rank - 1) &&
1895 reference_sw_arr[p_sw->
1896 rank]->down_port_groups_num !=
1897 p_sw->down_port_groups_num) {
1898 /* we're allowing some hca's to be missing */
1899 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1900 "ERR AB0A: Different number of downward port groups on switches:\n"
1901 " GUID 0x%016" PRIx64
1902 ", LID %u, Index %s - %u port groups\n"
1903 " GUID 0x%016" PRIx64
1904 ", LID %u, Index %s - %u port groups\n",
1906 (reference_sw_arr[p_sw->rank]),
1907 reference_sw_arr[p_sw->rank]->lid,
1909 (reference_sw_arr[p_sw->rank]->tuple),
1910 reference_sw_arr[p_sw->
1912 down_port_groups_num,
1913 sw_get_guid_ho(p_sw), p_sw->lid,
1914 tuple_to_str(p_sw->tuple),
1915 p_sw->down_port_groups_num);
/* every upward group of this switch is compared with the first upward
   group of the reference switch */
1920 if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1923 reference_sw_arr[p_sw->
1924 rank]->up_port_groups[0];
1925 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1926 p_group = p_sw->up_port_groups[i];
1927 if (cl_ptr_vector_get_size
1928 (&p_ref_group->ports) !=
1929 cl_ptr_vector_get_size
1930 (&p_group->ports)) {
1931 OSM_LOG(&p_ftree->p_osm->log,
1933 "ERR AB0B: Different number of ports in an upward port group on switches:\n"
1936 ", LID %u, Index %s - %u ports\n"
1939 ", LID %u, Index %s - %u ports\n",
1943 reference_sw_arr[p_sw->
1948 [p_sw->rank]->tuple),
1949 cl_ptr_vector_get_size
1950 (&p_ref_group->ports),
1951 sw_get_guid_ho(p_sw),
1955 cl_ptr_vector_get_size
1962 if (reference_sw_arr[p_sw->rank]->down_port_groups_num
1963 != 0 && p_sw->rank != (tree_rank - 1)) {
1964 /* we're allowing some hca's to be missing */
1966 reference_sw_arr[p_sw->
1967 rank]->down_port_groups[0];
1968 for (i = 0; i < p_sw->down_port_groups_num; i++) {
/* inspect the i-th downward group so that every group of the switch,
   not only the first one, is compared with the reference group */
1969 p_group = p_sw->down_port_groups[i];
1970 if (cl_ptr_vector_get_size
1971 (&p_ref_group->ports) !=
1972 cl_ptr_vector_get_size
1973 (&p_group->ports)) {
1974 OSM_LOG(&p_ftree->p_osm->log,
1976 "ERR AB0C: Different number of ports in an downward port group on switches:\n"
1979 ", LID %u, Index %s - %u ports\n"
1982 ", LID %u, Index %s - %u ports\n",
1986 reference_sw_arr[p_sw->
1991 [p_sw->rank]->tuple),
1992 cl_ptr_vector_get_size
1993 (&p_ref_group->ports),
1994 sw_get_guid_ho(p_sw),
1998 cl_ptr_vector_get_size
2006 } /* end of while */
2009 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
2010 "Fabric topology has been identified as FatTree\n");
2012 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2013 "ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
2015 free(reference_sw_arr);
2016 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2018 } /* fabric_validate_topology() */
2020 /***************************************************
2021 ***************************************************/
/*
 * set_sw_fwd_table() - per-switch callback that publishes the fabric-wide
 * maximal LID to the underlying OpenSM switch object.
 *
 * p_map_item is treated as an ftree_sw_t and context as the ftree_fabric_t;
 * the switch's p_osm_sw->max_lid_ho is set to p_ftree->lft_max_lid.
 */
2023 static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
2026 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
2027 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
2029 p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid;
2032 /***************************************************
2033 ***************************************************/
2036 * Function: Finds the least loaded port group and stores its counter
/*
 * recalculate_min_counter_down() - recompute p_sw->min_counter_down as the
 * smallest counter_down among the switch's downward port groups.
 * With no downward port groups the stored value is the initial (1 << 30).
 */
2039 static inline void recalculate_min_counter_down(ftree_sw_t * p_sw)
/* larger than any expected counter, so the first group always replaces it */
2041 uint32_t min = (1 << 30);
2043 for (i = 0; i < p_sw->down_port_groups_num; i++) {
2044 if (p_sw->down_port_groups[i]->counter_down < min) {
2045 min = p_sw->down_port_groups[i]->counter_down;
2048 p_sw->min_counter_down = min;
2053 * Function: Return the counter value of the least loaded down port group
/*
 * find_lowest_loaded_group_on_sw() - return the cached
 * p_sw->min_counter_down value; nothing is recomputed here.
 */
2056 static inline uint32_t find_lowest_loaded_group_on_sw(ftree_sw_t * p_sw)
2058 return p_sw->min_counter_down;
2062 * Function: Compare the load of two port groups and return which is the least loaded
2063 * Given : Two port groups with remote switch
2064 * When both port groups are equally loaded, it picks the one whose
2065 * remote switch down ports are least loaded.
2066 * This way, it prefers the switch from where it will be easier to go down (creating upward routes).
2067 * If both are equal, it picks the lowest INDEX to be deterministic.
/*
 * port_group_compare_load_down() - three-stage ordering of two port groups:
 * first by their counter_down, then by the min_counter_down of the remote
 * switches (via find_lowest_loaded_group_on_sw()), and finally by
 * compare_port_groups_by_remote_switch_index().
 *
 * NOTE(review): presumably a non-zero difference is returned right after
 * each of the first two stages - confirm against the complete body.
 */
2069 static inline int port_group_compare_load_down(const ftree_port_group_t * p1,
2070 const ftree_port_group_t * p2)
2072 int temp = p1->counter_down - p2->counter_down;
2078 /* Find the less loaded remote sw and choose this one */
2081 find_lowest_loaded_group_on_sw(p1->remote_hca_or_sw.p_sw);
2083 find_lowest_loaded_group_on_sw(p2->remote_hca_or_sw.p_sw);
2084 temp = load1 - load2;
2088 /* If they are both equal, choose the lowest index */
2089 return compare_port_groups_by_remote_switch_index(&p1, &p2);
/*
 * port_group_compare_load_up() - order two port groups by their counter_up,
 * falling back to compare_port_groups_by_remote_switch_index().
 *
 * NOTE(review): presumably a non-zero counter difference is returned before
 * the index comparison - confirm against the complete body.
 */
2092 static inline int port_group_compare_load_up(const ftree_port_group_t * p1,
2093 const ftree_port_group_t * p2)
2095 int temp = p1->counter_up - p2->counter_up;
2101 /* If they are both equal, choose the lowest index */
2102 return compare_port_groups_by_remote_switch_index (&p1,&p2);
2106 * Function: Sorts an array of port group by up load order
2107 * Given : A port group array and its length
2108 * As the list is mostly sorted, we used a bubble sort instead of qsort
2109 * as it is much faster.
2112 * This function and bubble_sort_down must NOT be merged into one generic function.
2113 * Although most of the code is the same and a function pointer could be used
2114 * for the comparison function, it would prevent the comparison function from being inlined
2115 * and would cost a great deal of performance.
/*
 * bubble_sort_up() - bubble-sort p_group_array (nmemb entries) using
 * port_group_compare_load_up().
 *
 * The owning switch is taken from the first entry; when its
 * counter_up_changed flag is FALSE the array is considered already ordered.
 * After sorting the flag is reset to FALSE.
 * p_group_array[0] is read unconditionally, so nmemb must be at least 1.
 */
2118 bubble_sort_up(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2122 ftree_port_group_t *tmp = p_group_array[0];
2124 /* As this function is a great number of times, we only go into the loop
2125 * if one of the port counters has changed, thus saving some tests */
2126 if (tmp->hca_or_sw.p_sw->counter_up_changed == FALSE) {
2129 /* While we did modifications on the array order */
2130 /* i may grew above array length but next loop will fail and tmp will be null for the next time
2131 * this way we save a test i < nmemb for each pass through the loop */
2132 for (i = 0; tmp; i++) {
2133 /* Assume the array is orderd */
2135 /* Comparing elements j and j-1 */
2136 for (j = 1; j < (nmemb - i); j++) {
2137 /* If they are the wrong way around */
2138 if (port_group_compare_load_up(p_group_array[j],
2139 p_group_array[j - 1]) < 0) {
2140 /* We invert them */
2141 tmp = p_group_array[j - 1];
2142 p_group_array[j - 1] = p_group_array[j];
2143 p_group_array[j] = tmp;
2144 /* This sets tmp != NULL so the main loop will make another pass */
2149 /* We have reordered the array so as long noone changes the counter
2150 * it's not necessary to do it again */
2151 p_group_array[0]->hca_or_sw.p_sw->counter_up_changed = FALSE;
/*
 * bubble_sort_siblings() - bubble-sort p_group_array (nmemb entries) using
 * port_group_compare_load_up().  Unlike bubble_sort_up() no "changed" flag
 * is consulted here.
 * 'tmp' doubles as the "a swap happened" indicator that drives the outer
 * loop.  p_group_array[0] is read unconditionally, so nmemb must be at
 * least 1.
 */
2155 bubble_sort_siblings(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2159 ftree_port_group_t *tmp = p_group_array[0];
2161 /* While we did modifications on the array order */
2162 /* i may grew above array length but next loop will fail and tmp will be null for the next time
2163 * this way we save a test i < nmemb for each pass through the loop */
2164 for (i = 0; tmp != NULL; i++) {
2165 /* Assume the array is orderd */
2167 /* Comparing elements j and j-1 */
2168 for (j = 1; j < (nmemb - i); j++) {
2169 /* If they are the wrong way around */
2170 if (port_group_compare_load_up(p_group_array[j],
2171 p_group_array[j - 1]) < 0) {
2172 /* We invert them */
2173 tmp = p_group_array[j - 1];
2174 p_group_array[j - 1] = p_group_array[j];
2175 p_group_array[j] = tmp;
2182 * Function: Sorts an array of port groups. The order is decided by
2183 * port_group_compare_load_down (down counters, least loaded remote switch, lowest index)
2184 * Given : A port group array and its length. Each port group points to a remote switch (not a HCA)
2185 * As the list is mostly sorted, we used a bubble sort instead of qsort
2186 * as it is much faster.
2189 * This function and bubble_sort_up must NOT be merged into one generic function.
2190 * Although most of the code is the same and a function pointer could be used
2191 * for the comparison function, it would prevent the comparison function from being inlined
2192 * and would cost a great deal of performance.
/*
 * bubble_sort_down() - bubble-sort p_group_array (nmemb entries) using
 * port_group_compare_load_down().
 * 'tmp' doubles as the "a swap happened" indicator that drives the outer
 * loop.  p_group_array[0] is read unconditionally, so nmemb must be at
 * least 1.
 */
2195 bubble_sort_down(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2199 ftree_port_group_t *tmp = p_group_array[0];
2201 /* While we did modifications on the array order */
2202 /* i may grew above array length but next loop will fail and tmp will be null for the next time
2203 * this way we save a test i < nmemb for each pass through the loop */
2204 for (i = 0; tmp; i++) {
2205 /* Assume the array is orderd */
2207 /* Comparing elements j and j-1 */
2208 for (j = 1; j < (nmemb - i); j++) {
2209 /* If they are the wrong way around */
2210 if (port_group_compare_load_down
2211 (p_group_array[j], p_group_array[j - 1]) < 0) {
2212 /* We invert them */
2213 tmp = p_group_array[j - 1];
2214 p_group_array[j - 1] = p_group_array[j];
2215 p_group_array[j] = tmp;
2222 /***************************************************
2223 ***************************************************/
2226 * Function: assign-up-going-port-by-descending-down
2227 * Given : a switch and a LID
2229 * foreach down-going-port-group (in indexing order)
2230 * skip this group if the LFT(LID) port is part of this group
2231 * find the least loaded port of the group (scan in indexing order)
2232 * r-port is the remote port connected to it
2233 * assign the remote switch node LFT(LID) to r-port
2234 * increase r-port usage counter
2235 * assign-up-going-port-by-descending-down to r-port node (recursion)
/*
 * fabric_route_upgoing_by_going_down() - walk down from p_sw and program,
 * on the switches below it, the port through which target_lid is reached.
 *
 * p_ftree:        fabric context
 * p_sw:           switch whose downward port groups are scanned; its sibling
 *                 port groups are scanned as well when target_lid != 0
 * p_prev_sw:      switch we arrived from; a group whose remote LID equals
 *                 p_prev_sw->lid is not followed.  The recursive call passes
 *                 NULL to mark that it went down and not up
 * target_lid:     LID that is being routed to
 * is_main_path:   whether this is a path that is tracked by the counters
 * is_target_a_sw: whether target_lid belongs to a switch
 * current_hops:   hops done so far; remote switches are given
 *                 current_hops + 1
 *
 * For every followed group the port with the lowest counter_up is chosen,
 * the remote switch's new_lft[target_lid] is set to that port's remote port
 * number, hops are recorded with set_hops_on_remote_sw() and the function
 * recurses into the remote switch.
 *
 * Returns created_route, which becomes TRUE when a scanned group points to
 * a non-switch node or when a recursive call reported a route.
 *
 * NOTE(review): the exact conditions guarding the LFT update, the counter
 * promotion and the down_port_groups_idx update should be confirmed against
 * the complete function body.
 */
2239 fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
2240 IN ftree_sw_t * p_sw,
2241 IN ftree_sw_t * p_prev_sw,
2242 IN uint16_t target_lid,
2243 IN boolean_t is_main_path,
2244 IN boolean_t is_target_a_sw,
2245 IN uint8_t current_hops)
2247 ftree_sw_t *p_remote_sw;
2249 ftree_port_group_t *p_group;
2250 ftree_port_t *p_port;
2251 ftree_port_t *p_min_port;
2254 boolean_t created_route = FALSE;
2255 boolean_t routed = 0;
2258 /* if there is no down-going ports */
2259 if (p_sw->down_port_groups_num == 0)
2262 /* foreach down-going port group (in load order) */
2263 bubble_sort_up(p_sw->down_port_groups, p_sw->down_port_groups_num);
2265 if (p_sw->sibling_port_groups_num > 0)
2266 bubble_sort_siblings(p_sw->sibling_port_groups,
2267 p_sw->sibling_port_groups_num);
2271 (p_sw->down_port_groups_num +
2272 ((target_lid != 0) ? p_sw->sibling_port_groups_num : 0)); k++) {
2274 if (k < p_sw->down_port_groups_num) {
2275 p_group = p_sw->down_port_groups[k];
2278 p_sw->sibling_port_groups[k -
2280 down_port_groups_num];
2283 /* If this port group doesn't point to a switch, mark
2284 that the route was created and skip to the next group */
2285 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) {
2286 created_route = TRUE;
2291 && p_group->remote_lid == p_prev_sw->lid) {
2292 /* This port group has a port that was used when we entered this switch,
2293 which means that the current group points to the switch where we were
2294 at the previous step of the algorithm (before going up).
2295 Skipping this group. */
2299 /* find the least loaded port of the group (in indexing order) */
2301 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2305 for (j = 0; j < ports_num; j++) {
2306 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2307 /* first port that we're checking - set as port with the lowest load */
2308 /* or this port is less loaded - use it as min */
2310 p_port->counter_up < p_min_port->counter_up)
2311 p_min_port = p_port;
2313 /* At this point we have selected a port in this group with the
2314 lowest load of upgoing routes.
2315 Set on the remote switch how to get to the target_lid -
2316 set LFT(target_lid) on the remote switch to the remote port */
2317 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2318 least_hops = sw_get_least_hops(p_remote_sw, target_lid);
2320 if (least_hops != OSM_NO_PATH) {
2321 /* Loop in the fabric - we already routed the remote switch
2322 on our way UP, and now we see it again on our way DOWN */
2323 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2324 "Loop of length %d in the fabric:\n "
2325 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
2327 tuple_to_str(p_remote_sw->tuple),
2329 tuple_to_str(p_sw->tuple),
2330 p_group->remote_lid);
2331 /* We skip only if we have come through a longer path */
2332 if (current_hops + 1 >= least_hops)
2336 /* Four possible cases:
2338 * 1. is_main_path == TRUE:
2339 * - going DOWN(TRUE,TRUE) through ALL the groups
2340 * + promoting port counter
2341 * + setting path in remote switch fwd tbl
2342 * + setting hops in remote switch on all the ports of each group
2344 * 2. is_main_path == FALSE:
2345 * - going DOWN(TRUE,FALSE) through ALL the groups but only if
2346 * the remote (lower) switch hasn't been already configured
2347 * for this target LID (or with a longer path)
2348 * + promoting port counter
2349 * + setting path in remote switch fwd tbl if it hasn't been set yet
2350 * + setting hops in remote switch on all the ports of each group
2351 * if it hasn't been set yet
2354 /* setting fwd tbl port only */
2355 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2356 p_min_port->remote_port_num;
2357 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2358 "Switch %s: set path to CA LID %u through port %u\n",
2359 tuple_to_str(p_remote_sw->tuple),
2360 target_lid, p_min_port->remote_port_num);
2362 /* On the remote switch that is pointed by the p_group,
2363 set hops for ALL the ports in the remote group. */
2365 set_hops_on_remote_sw(p_group, target_lid,
2366 current_hops + 1, is_target_a_sw);
2369 Assign upgoing ports by stepping down, starting on REMOTE switch */
2370 routed = fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */
2371 NULL, /* prev. position - NULL to mark that we went down and not up */
2372 target_lid, /* LID that we're routing to */
2373 is_main_path, /* whether this is path to HCA that should by tracked by counters */
2374 is_target_a_sw, /* Whether target lid is a switch or not */
2375 current_hops + 1); /* Number of hops done to this point */
2376 created_route |= routed;
2377 /* Counters are promoted only if a route toward a node is created */
2379 p_min_port->counter_up++;
2380 p_group->counter_up++;
2381 p_group->hca_or_sw.p_sw->counter_up_changed = TRUE;
2384 /* done scanning all the down-going port groups */
2386 /* if the route was created, promote the index that
2387 indicates which group should we start with when
2388 going through all the downgoing groups */
2390 p_sw->down_port_groups_idx = (p_sw->down_port_groups_idx + 1)
2391 % p_sw->down_port_groups_num;
2393 return created_route;
2394 } /* fabric_route_upgoing_by_going_down() */
2396 /***************************************************/
2399 * Function: assign-down-going-port-by-ascending-up
2400 * Given : a switch and a LID
2402 * find the least loaded port of all the upgoing groups (scan in indexing order)
2403 * assign the LFT(LID) of remote switch to that port
2404 * track that port usage
2405 * assign-up-going-port-by-descending-down on CURRENT switch
2406 * assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
/*
 * Programs the path towards target_lid on the switches that are reached
 * by going up (and sideways) from p_sw, recursing switch by switch.
 *
 * Steps shown by the visible statements:
 *  - run fabric_route_upgoing_by_going_down() on p_sw itself;
 *  - on a rank-0 switch: when reverse_hop_credit is non-zero, recurse into
 *    the switches behind the down port groups with one credit less, then
 *    return;
 *  - otherwise sort up_port_groups, take group 0 and its port with the
 *    lowest counter_down, write new_lft[target_lid] on the remote switch,
 *    set the hops on the remote switch and recurse into it;
 *  - walk the up port groups (from index 1 when is_main_path, else from 0)
 *    and then the sibling port groups, recursing with is_main_path FALSE;
 *  - when reverse_hop_credit is non-zero, recurse into the switches behind
 *    the down port groups with is_main_path TRUE and one credit less.
 *
 * p_ftree            - fabric context; used for logging and passed on to
 *                      the nested calls
 * p_sw               - switch the routing step starts from
 * p_prev_sw          - previous position switch (callers in this file
 *                      pass NULL for the first step)
 * target_lid         - LID that is being routed to
 * is_main_path       - whether this path should be tracked by counters
 * is_target_a_sw     - whether the target LID is a switch
 * reverse_hop_credit - remaining reverse hops allowed
 * reverse_hops       - number of reverse hops done up to this point
 * current_hops       - number of hops done up to this point
 *
 * Returns created_route: the result of the initial upgoing call OR-ed with
 * the results of the nested routing calls.
 *
 * NOTE(review): some guarding conditions and loop-control statements are
 * not visible in this copy of the function; the summary above only covers
 * what the visible statements show.
 */
2410 fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2411 IN ftree_sw_t * p_sw,
2412 IN ftree_sw_t * p_prev_sw,
2413 IN uint16_t target_lid,
2414 IN boolean_t is_main_path,
2415 IN boolean_t is_target_a_sw,
2416 IN uint16_t reverse_hop_credit,
2417 IN uint16_t reverse_hops,
2418 IN uint8_t current_hops)
2420 ftree_sw_t *p_remote_sw;
2422 ftree_port_group_t *p_group;
2423 ftree_port_t *p_port;
2424 ftree_port_group_t *p_min_group;
2425 ftree_port_t *p_min_port;
2428 boolean_t created_route = FALSE;
2429 boolean_t routed = FALSE;
2432 /* Assign upgoing ports by stepping down, starting on THIS switch */
2433 created_route = fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */
2434 p_prev_sw, /* switch that we went up from (NULL means that we went down) */
2435 target_lid, /* LID that we're routing to */
2436 is_main_path, /* whether this path to HCA should be tracked by counters */
2437 is_target_a_sw, /* Whether target lid is a switch or not */
2438 current_hops); /* Number of hops done up to this point */
2440 /* recursion stop condition - if it's a root switch, */
2441 if (p_sw->rank == 0) {
2442 if (reverse_hop_credit > 0) {
2443 /* We go up by going down as we have some reverse_hop_credit left */
2444 /* We use the index to scatter a bit the reverse up routes */
2445 p_sw->down_port_groups_idx =
2446 (p_sw->down_port_groups_idx +
2447 1) % p_sw->down_port_groups_num;
2448 i = p_sw->down_port_groups_idx;
2449 for (j = 0; j < p_sw->down_port_groups_num; j++) {
2451 p_group = p_sw->down_port_groups[i];
2452 i = (i + 1) % p_sw->down_port_groups_num;
2454 /* Skip this port group unless it points to a switch */
2455 if (p_group->remote_node_type !=
2456 IB_NODE_TYPE_SWITCH)
2458 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2460 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2461 p_sw, /* this switch - prev. position switch for the function */
2462 target_lid, /* LID that we're routing to */
2463 is_main_path, /* whether this is path to HCA that should be tracked by counters */
2464 is_target_a_sw, /* Whether target lid is a switch or not */
2465 reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
2466 reverse_hops + 1, /* Number of reverse_hops done up to this point */
2473 return created_route;
2476 /* We should generate a list of port sorted by load so we can find easily the least
2477 * going port and explore the other ports on secondary routes more easily (and quickly) */
2478 bubble_sort_down(p_sw->up_port_groups, p_sw->up_port_groups_num);
2480 p_min_group = p_sw->up_port_groups[0];
2481 /* Find the least loaded upgoing port in the selected group */
2483 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2484 for (j = 0; j < ports_num; j++) {
2485 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2487 /* first port that we're checking - use
2488 it as a port with the lowest load */
2489 p_min_port = p_port;
2490 } else if (p_port->counter_down < p_min_port->counter_down) {
2491 /* this port is less loaded - use it as min */
2492 p_min_port = p_port;
2496 /* At this point we have selected a group and port with the
2497 lowest load of downgoing routes.
2498 Set on the remote switch how to get to the target_lid -
2499 set LFT(target_lid) on the remote switch to the remote port */
2500 p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2502 /* Four possible cases:
2504 * 1. is_main_path == TRUE:
2505 * - going UP(TRUE,TRUE) on selected min_group and min_port
2506 * + promoting port counter
2507 * + setting path in remote switch fwd tbl
2508 * + setting hops in remote switch on all the ports of selected group
2509 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2510 * + NOT promoting port counter
2511 * + setting path in remote switch fwd tbl if it hasn't been set yet
2512 * + setting hops in remote switch on all the ports of each group
2513 * if it hasn't been set yet
2515 * 2. is_main_path == FALSE:
2516 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2517 * but only if the remote (upper) switch hasn't been already
2518 * configured for this target LID
2519 * + NOT promoting port counter
2520 * + setting path in remote switch fwd tbl if it hasn't been set yet
2521 * + setting hops in remote switch on all the ports of each group
2522 * if it hasn't been set yet
2525 /* covering first half of case 1, and case 3 */
2527 if (p_sw->is_leaf) {
2528 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2529 " - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2530 (target_lid != 0) ? "real" : "DUMMY",
2532 tuple_to_str(p_sw->tuple),
2533 tuple_to_str(p_remote_sw->tuple));
2535 /* The number of downgoing routes is tracked in the
2536 p_group->counter_down p_port->counter_down counters of the
2537 group and port that belong to the lower side of the link
2538 (on switch with higher rank) */
2539 p_min_group->counter_down++;
2540 p_min_port->counter_down++;
2541 if (p_min_group->counter_down ==
2542 (p_min_group->remote_hca_or_sw.p_sw->min_counter_down +
2544 recalculate_min_counter_down
2545 (p_min_group->remote_hca_or_sw.p_sw);
2548 /* This LID may already be in the LFT if the reverse_hop feature is used */
2549 /* We update the LFT only if this LID isn't already present. */
2551 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2552 if ((p_remote_sw->p_osm_sw->new_lft[target_lid] == OSM_NO_PATH)
2555 sw_get_least_hops(p_remote_sw, target_lid))) {
2557 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2558 p_min_port->remote_port_num;
2559 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2560 "Switch %s: set path to CA LID %u through port %u\n",
2561 tuple_to_str(p_remote_sw->tuple),
2563 p_min_port->remote_port_num);
2565 /* On the remote switch that is pointed by the min_group,
2566 set hops for ALL the ports in the remote group. */
2568 set_hops_on_remote_sw(p_min_group, target_lid,
2572 /* Recursion step: Assign downgoing ports by stepping up, starting on REMOTE switch. */
2573 created_route |= fabric_route_downgoing_by_going_up(p_ftree,
2574 p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2575 p_sw, /* this switch - prev. position switch for the function */
2576 target_lid, /* LID that we're routing to */
2577 is_main_path, /* whether this is path to HCA that should be tracked by counters */
2578 is_target_a_sw, /* Whether target lid is a switch or not */
2579 reverse_hop_credit, /* Remaining reverse_hops allowed */
2580 reverse_hops, /* Number of reverse_hops done up to this point */
2584 /* What's left to do at this point:
2586 * 1. is_main_path == TRUE:
2587 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2588 * but only if the remote (upper) switch hasn't been already
2589 * configured for this target LID
2590 * + NOT promoting port counter
2591 * + setting path in remote switch fwd tbl if it hasn't been set yet
2592 * + setting hops in remote switch on all the ports of each group
2593 * if it hasn't been set yet
2595 * 2. is_main_path == FALSE:
2596 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2597 * but only if the remote (upper) switch hasn't been already
2598 * configured for this target LID
2599 * + NOT promoting port counter
2600 * + setting path in remote switch fwd tbl if it hasn't been set yet
2601 * + setting hops in remote switch on all the ports of each group
2602 * if it hasn't been set yet
2604 * These two rules can be rephrased this way:
2605 * - foreach UP port group
2606 * + if remote switch has been set with the target LID
2607 * - skip this port group
2610 * - do NOT promote port counter
2611 * - set path in remote switch fwd tbl
2612 * - set hops in remote switch on all the ports of this group
2613 * - go UP(TRUE,FALSE) to the remote switch
2616 for (i = is_main_path ? 1 : 0; i < p_sw->up_port_groups_num; i++) {
2617 p_group = p_sw->up_port_groups[i];
2618 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2620 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2621 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2622 if (current_hops + 1 >=
2623 sw_get_least_hops(p_remote_sw, target_lid))
2626 if (p_sw->is_leaf) {
2627 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2628 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2630 tuple_to_str(p_sw->tuple),
2631 tuple_to_str(p_remote_sw->tuple));
2634 /* Routing REAL lids on SECONDARY path means routing
2635 switch-to-switch or switch-to-CA paths.
2636 We can safely assume that switch will initiate very
2637 little traffic, so there's no point wasting runtime on
2638 trying to balance these routes - always pick port 0.
NOTE(review): the loop below keeps the port with the lowest
counter_down rather than port 0 - this sentence looks stale. */
2640 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2643 for (j = 0; j < ports_num; j++) {
2644 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2646 /* first port that we're checking - use
2647 it as a port with the lowest load */
2648 p_min_port = p_port;
2649 } else if (p_port->counter_down <
2650 p_min_port->counter_down) {
2651 /* this port is less loaded - use it as min */
2652 p_min_port = p_port;
2656 p_port = p_min_port;
2657 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2658 p_port->remote_port_num;
2660 /* On the remote switch that is pointed by the p_group,
2661 set hops for ALL the ports in the remote group. */
2663 set_hops_on_remote_sw(p_group, target_lid,
2664 current_hops + 1, is_target_a_sw);
2667 Assign downgoing ports by stepping up, starting on REMOTE switch. */
2668 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2669 p_sw, /* this switch - prev. position switch for the function */
2670 target_lid, /* LID that we're routing to */
2671 FALSE, /* whether this is path to HCA that should be tracked by counters */
2672 is_target_a_sw, /* Whether target lid is a switch or not */
2673 reverse_hop_credit, /* Remaining reverse_hops allowed */
2674 reverse_hops, /* Number of reverse_hops done up to this point */
2676 created_route |= routed;
2679 /* Now doing the same thing with horizontal links */
2680 if (p_sw->sibling_port_groups_num > 0)
2681 bubble_sort_down(p_sw->sibling_port_groups,
2682 p_sw->sibling_port_groups_num);
2684 for (i = 0; i < p_sw->sibling_port_groups_num; i++) {
2685 p_group = p_sw->sibling_port_groups[i];
2686 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2688 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2689 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2690 if (current_hops + 1 >=
2691 sw_get_least_hops(p_remote_sw, target_lid))
2694 if (p_sw->is_leaf) {
2695 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2696 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2698 tuple_to_str(p_sw->tuple),
2699 tuple_to_str(p_remote_sw->tuple));
2702 /* Routing REAL lids on SECONDARY path means routing
2703 switch-to-switch or switch-to-CA paths.
2704 We can safely assume that switch will initiate very
2705 little traffic, so there's no point wasting runtime on
2706 trying to balance these routes - always pick port 0.
NOTE(review): the loop below keeps the port with the lowest
counter_down rather than port 0 - this sentence looks stale. */
2709 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2710 for (j = 0; j < ports_num; j++) {
2711 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2713 /* first port that we're checking - use
2714 it as a port with the lowest load */
2715 p_min_port = p_port;
2716 } else if (p_port->counter_down <
2717 p_min_port->counter_down) {
2718 /* this port is less loaded - use it as min */
2719 p_min_port = p_port;
2723 p_port = p_min_port;
2724 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2725 p_port->remote_port_num;
2727 /* On the remote switch that is pointed by the p_group,
2728 set hops for ALL the ports in the remote group. */
2730 set_hops_on_remote_sw(p_group, target_lid,
2731 current_hops + 1, is_target_a_sw);
2734 Assign downgoing ports by stepping up, starting on REMOTE switch. */
2735 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2736 p_sw, /* this switch - prev. position switch for the function */
2737 target_lid, /* LID that we're routing to */
2738 FALSE, /* whether this is path to HCA that should be tracked by counters */
2739 is_target_a_sw, /* Whether target lid is a switch or not */
2740 reverse_hop_credit, /* Remaining reverse_hops allowed */
2741 reverse_hops, /* Number of reverse_hops done up to this point */
2743 created_route |= routed;
2745 p_min_group->counter_down++;
2746 p_min_port->counter_down++;
2750 /* If we don't have any reverse hop credits, we are done */
2751 if (reverse_hop_credit == 0)
2752 return created_route;
2755 return created_route;
2757 /* We explore all the down group ports */
2758 /* We try to reverse jump for each of them */
2759 /* They already have a route to us from the upgoing_by_going_down started earlier */
2760 /* This is only so it'll continue exploring up, after this step backwards */
2761 for (i = 0; i < p_sw->down_port_groups_num; i++) {
2762 p_group = p_sw->down_port_groups[i];
2763 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2765 /* Skip this port group unless it points to a switch */
2766 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
2770 Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */
2771 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2772 p_sw, /* this switch - prev. position switch for the function */
2773 target_lid, /* LID that we're routing to */
2774 TRUE, /* whether this is path to HCA that should be tracked by counters */
2775 is_target_a_sw, /* Whether target lid is a switch or not */
2776 reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
2777 reverse_hops + 1, /* Number of reverse_hops done up to this point */
2781 return created_route;
2783 } /* ftree_fabric_route_downgoing_by_going_up() */
2785 /***************************************************/
2789 * foreach leaf switch (in indexing order)
2790 * for each compute node (in indexing order)
2791 * obtain the LID of the compute node
2792 * set local LFT(LID) of the port connecting to compute node
2793 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2794 * for each MISSING compute node
2795 * call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
/*
 * Routes every compute node (CN) LID, leaf switch by leaf switch.
 *
 * For each leaf switch in p_ftree->leaf_switches and each of its down port
 * groups, the visible code takes the remote LID of the group, writes the
 * leaf's new_lft entry for that LID with the number of port 0 of the
 * group, records a 1-hop entry via sw_set_hops() and then calls
 * fabric_route_downgoing_by_going_up() as a counted (main) path with no
 * reverse hops. Each routed target is counted in routed_targets_on_leaf.
 *
 * When max_cn_per_leaf exceeds routed_targets_on_leaf, dummy targets are
 * routed with LID 0, after which hops[0] and new_lft[0] are reset to
 * OSM_NO_PATH on every switch in sw_tbl.
 *
 * NOTE(review): the statements that skip non-CA / non-CN port groups are
 * not visible in this copy; only their describing comments are.
 */
2798 static void fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
2802 ftree_port_group_t *p_leaf_port_group;
2803 ftree_port_group_t *p_hca_port_group;
2804 ftree_port_t *p_port;
2807 unsigned routed_targets_on_leaf;
2809 OSM_LOG_ENTER(&p_ftree->p_osm->log);
2811 /* for each leaf switch (in indexing order) */
2812 for (i = 0; i < p_ftree->leaf_switches_num; i++) {
2813 p_sw = p_ftree->leaf_switches[i];
2814 routed_targets_on_leaf = 0;
2816 /* for each HCA connected to this switch */
2817 for (j = 0; j < p_sw->down_port_groups_num; j++) {
2818 p_leaf_port_group = p_sw->down_port_groups[j];
2820 /* work with this port group only if the remote node is CA */
2821 if (p_leaf_port_group->remote_node_type !=
2825 p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;
2827 /* work with this port group only if remote HCA has CNs */
2832 hca_get_port_group_by_lid(p_hca,
2835 CL_ASSERT(p_hca_port_group);
2837 /* work with this port group only if remote port is CN */
2838 if (!p_hca_port_group->is_cn)
2841 /* obtain the LID of HCA port */
2842 hca_lid = p_leaf_port_group->remote_lid;
2844 /* set local LFT(LID) to the port that is connected to HCA */
2845 cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
2847 p_sw->p_osm_sw->new_lft[hca_lid] = p_port->port_num;
2849 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2850 "Switch %s: set path to CN LID %u through port %u\n",
2851 tuple_to_str(p_sw->tuple),
2852 hca_lid, p_port->port_num);
2854 /* set local min hop table(LID) to route to the CA */
2855 sw_set_hops(p_sw, hca_lid, p_port->port_num, 1, FALSE);
2857 /* Assign downgoing ports by stepping up.
2858 Since we're routing here only CNs, we're routing it as REAL
2859 LID and updating fat-tree balancing counters. */
2860 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2861 NULL, /* prev. position switch */
2862 hca_lid, /* LID that we're routing to */
2863 TRUE, /* whether this path to HCA should be tracked by counters */
2864 FALSE, /* whether target lid is a switch or not */
2865 0, /* Number of reverse hops allowed */
2866 0, /* Number of reverse hops done yet */
2867 1); /* Number of hops done yet */
2869 /* count how many real targets have been routed from this leaf switch */
2870 routed_targets_on_leaf++;
2873 /* We're done with the real targets (all CNs) of this leaf switch.
2874 Now route the dummy HCAs that are missing or that are non-CNs.
2875 When routing to dummy HCAs we don't fill lid matrices. */
2876 if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
/* NOTE(review): the value logged here is max_cn_per_leaf -
   down_port_groups_num, while the expression used right after the
   message is max_cn_per_leaf - routed_targets_on_leaf; the two can
   differ - confirm which count the message should report. */
2877 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2878 "Routing %u dummy CAs\n",
2879 p_ftree->max_cn_per_leaf -
2880 p_sw->down_port_groups_num);
2882 p_ftree->max_cn_per_leaf - routed_targets_on_leaf;
2884 ftree_sw_t *p_next_sw, *p_ftree_sw;
2885 sw_set_hops(p_sw, 0, 0xFF, 1, FALSE);
2886 /* assign downgoing ports by stepping up */
2887 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2888 NULL, /* prev. position switch */
2889 0, /* LID that we're routing to - ignored for dummy HCA */
2890 TRUE, /* whether this path to HCA should be tracked by counters */
2891 FALSE, /* Whether the target LID is a switch or not */
2892 0, /* Number of reverse hops allowed */
2893 0, /* Number of reverse hops done yet */
2894 1); /* Number of hops done yet */
2896 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
2897 /* need to clean the LID 0 hops for dummy node */
2898 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
2899 p_ftree_sw = p_next_sw;
2900 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_ftree_sw->map_item);
2901 p_ftree_sw->hops[0] = OSM_NO_PATH;
2902 p_ftree_sw->p_osm_sw->new_lft[0] = OSM_NO_PATH;
2908 /* done going through all the leaf switches */
2909 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2910 } /* fabric_route_to_cns() */
2912 /***************************************************/
2916 * foreach HCA non-CN port in fabric
2917 * obtain the LID of the HCA port
2918 * get switch that is connected to this HCA port
2919 * set switch LFT(LID) to the port connected to the HCA port
2920 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on the switch
2922 * Routing to these HCAs is routing a REAL hca lid on MAIN path.
2923 * We want to allow load-leveling of the traffic to the non-CNs,
2924 * because such nodes may include IO nodes with heavy usage
2925 * - we should set fwd tables
2926 * - we should update port counters
2927 * Routing to non-CNs is done after routing to CNs, so updated port
2928 * counters will not affect CN-to-CN routing.
/*
 * Routes the HCA ports that are not compute nodes.
 *
 * Walks hca_tbl and, for each up port group of each HCA, takes the switch
 * on the remote side and the group's LID, writes the switch's new_lft
 * entry for that LID with the remote port number of port 0 of the group,
 * records a 1-hop entry via sw_set_hops() and calls
 * fabric_route_downgoing_by_going_up() as a counted (main) path.
 * Port groups flagged is_io are given subn.opt.max_reverse_hops reverse
 * hops; all others get 0.
 *
 * NOTE(review): the statements that skip CN port groups and groups not
 * connected to a switch are not visible in this copy; only their
 * describing comments and conditions are.
 */
2931 static void fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
2935 ftree_hca_t *p_next_hca;
2936 ftree_port_t *p_hca_port;
2937 ftree_port_group_t *p_hca_port_group;
2939 unsigned port_num_on_switch;
2942 OSM_LOG_ENTER(&p_ftree->p_osm->log);
2944 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
2945 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
2947 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
2949 for (i = 0; i < p_hca->up_port_groups_num; i++) {
2950 p_hca_port_group = p_hca->up_port_groups[i];
2952 /* skip this port if it's CN, in which case it has been already routed */
2953 if (p_hca_port_group->is_cn)
2956 /* skip this port if it is not connected to switch */
2957 if (p_hca_port_group->remote_node_type !=
2958 IB_NODE_TYPE_SWITCH)
2961 p_sw = p_hca_port_group->remote_hca_or_sw.p_sw;
2962 hca_lid = p_hca_port_group->lid;
2964 /* set switches LFT(LID) to the port that is connected to HCA */
2965 cl_ptr_vector_at(&p_hca_port_group->ports, 0,
2966 (void *)&p_hca_port);
2967 port_num_on_switch = p_hca_port->remote_port_num;
2968 p_sw->p_osm_sw->new_lft[hca_lid] = port_num_on_switch;
2970 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2971 "Switch %s: set path to non-CN HCA LID %u through port %u\n",
2972 tuple_to_str(p_sw->tuple),
2973 hca_lid, port_num_on_switch);
2975 /* set local min hop table(LID) to route to the CA */
2976 sw_set_hops(p_sw, hca_lid, port_num_on_switch, /* port num */
2977 1, FALSE); /* hops */
2979 /* Assign downgoing ports by stepping up.
2980 We're routing REAL targets. They are not CNs and not included
2981 in the leafs array, but we treat them as MAIN path to allow load
2982 leveling, which means that the counters will be updated. */
2983 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
2984 NULL, /* prev. position switch */
2985 hca_lid, /* LID that we're routing to */
2986 TRUE, /* whether this path to HCA should be tracked by counters */
2987 FALSE, /* Whether the target LID is a switch or not */
2988 p_hca_port_group->is_io ? p_ftree->p_osm->subn.opt.max_reverse_hops : 0, /* Number of reverse hops allowed */
2989 0, /* Number of reverse hops done yet */
2990 1); /* Number of hops done yet */
2992 /* done with all the port groups of this HCA - go to next HCA */
2995 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2996 } /* fabric_route_to_non_cns() */
2998 /***************************************************/
3002 * foreach switch in fabric
3004 * set local LFT(LID) to port 0
3005 * call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
3007 * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
3008 * - we should set fwd tables
3009 * - we should NOT update port counters
/*
 * Routes the LID of every switch in the fabric.
 *
 * Walks sw_tbl; for each switch it sets new_lft[own LID] to port 0,
 * records a 0-hop entry for its own LID via sw_set_hops() and calls
 * fabric_route_downgoing_by_going_up() with is_main_path FALSE,
 * is_target_a_sw TRUE, no reverse hops and a starting hop count of 0.
 */
3012 static void fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
3015 ftree_sw_t *p_next_sw;
3017 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3019 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3020 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3022 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3024 /* set local LFT(LID) to 0 (route to itself) */
3025 p_sw->p_osm_sw->new_lft[p_sw->lid] = 0;
3027 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3028 "Switch %s (LID %u): routing switch-to-switch paths\n",
3029 tuple_to_str(p_sw->tuple), p_sw->lid);
3031 /* set min hop table of the switch to itself */
3032 sw_set_hops(p_sw, p_sw->lid, 0, /* port_num */
3033 0, TRUE); /* hops */
3035 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */
3036 NULL, /* prev. position switch */
3037 p_sw->lid, /* LID that we're routing to */
3038 FALSE, /* whether this path to HCA should be tracked by counters */
3039 TRUE, /* Whether the target LID is a switch or not */
3040 0, /* Number of reverse hops allowed */
3041 0, /* Number of reverse hops done yet */
3042 0); /* Number of hops done yet */
3045 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3046 } /* fabric_route_to_switches() */
3048 /***************************************************
3049 ***************************************************/
/*
 * Fills in LFT entries that are still missing, by pointing them at one
 * selected leaf switch.
 *
 * The last entry of p_ftree->leaf_switches is taken as p_leaf_sw. The
 * function then walks sw_tbl and, per switch, the LIDs from 1 up to
 * p_leaf_sw's max_lid_ho. For a LID that gets routed, the switch's
 * new_lft[lid] is set to the port it already uses for p_leaf_sw->lid, and
 * sw_set_hops() is called with hops[p_leaf_sw->lid] on this switch plus
 * p_leaf_sw->hops[lid].
 *
 * NOTE(review): the three visible conditions (switch rank >=
 * leaf_switch_rank; LID already set in new_lft or unreachable from the
 * leaf; LID not belonging to a switch port) presumably skip the current
 * iteration, but the skipping statements themselves are not visible in
 * this copy - confirm.
 */
3051 static void fabric_route_roots(IN ftree_fabric_t * p_ftree)
3057 ftree_sw_t *p_leaf_sw;
3059 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3062 * We need a switch that will accomodate all the down/up turns in
3063 * the fabric. Having these turn in a single place in the fabric
3064 * will not create credit loops.
3065 * So we need to select this switch.
3066 * The idea here is to chose leaf with the highest index. I don't
3067 * have any theory to back me up on this. It's just a general thought
3068 * that this way the switch that might be a bottleneck for many mcast
3069 * groups will be far away from the OpenSM, so it will draw the
3070 * multicast traffic away from the SM.
3073 p_leaf_sw = p_ftree->leaf_switches[p_ftree->leaf_switches_num-1];
3076 * Now go over all the switches in the fabric that
3077 * have lower rank, and route the missing LIDs to
3078 * the selected leaf switch.
3079 * In short, this leaf switch now poses a target
3080 * for all those missing LIDs.
3083 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3084 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
3085 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
3087 if (p_sw->rank >= p_ftree->leaf_switch_rank)
3090 for (lid = 1; lid <= p_leaf_sw->p_osm_sw->max_lid_ho; lid ++) {
3092 if (p_sw->p_osm_sw->new_lft[lid] != OSM_NO_PATH ||
3093 p_leaf_sw->hops[lid] == OSM_NO_PATH)
3096 p_port = osm_get_port_by_lid_ho(&p_ftree->p_osm->subn,
3099 /* we're interested only in switches */
3100 if (!p_port || !p_port->p_node->sw)
3104 * the missing LID will be routed through the same
3105 * port that routes to the selected leaf switch
3107 port_num = p_sw->p_osm_sw->new_lft[p_leaf_sw->lid];
3109 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3110 "Switch %s: setting path to LID %u "
3111 "through port %u\n",
3112 tuple_to_str(p_sw->tuple), lid, port_num);
3115 p_sw->p_osm_sw->new_lft[lid] = port_num;
3118 * Set local min hop table.
3119 * The distance to the target LID is a distance
3120 * to the selected leaf switch plus the distance
3121 * from the leaf to the target LID.
3123 sw_set_hops(p_sw, lid, port_num,
3124 p_sw->hops[p_leaf_sw->lid] +
3125 p_leaf_sw->hops[lid], TRUE);
3129 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3130 } /* fabric_route_roots() */
3132 /***************************************************/
/*
 * Builds the fat-tree node tables from the subnet's node_guid_tbl.
 *
 * Every node in subn.node_guid_tbl is dispatched on its node type:
 * CAs go to fabric_add_hca(), switches go to fabric_add_sw(); a node of
 * any other (unknown) type is reported with "ERR AB0E". The handling of
 * routers is not visible in this copy.
 *
 * Returns an int status; the returned values are not visible here -
 * presumably non-zero on the unknown-node-type path (TODO confirm).
 */
3134 static int fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
3136 osm_node_t *p_osm_node;
3137 osm_node_t *p_next_osm_node;
3139 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3142 (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl);
3143 while (p_next_osm_node !=
3144 (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->
3145 subn.node_guid_tbl)) {
3146 p_osm_node = p_next_osm_node;
3148 (osm_node_t *) cl_qmap_next(&p_osm_node->map_item);
3149 switch (osm_node_get_type(p_osm_node)) {
3150 case IB_NODE_TYPE_CA:
3151 fabric_add_hca(p_ftree, p_osm_node);
3153 case IB_NODE_TYPE_ROUTER:
3155 case IB_NODE_TYPE_SWITCH:
3156 fabric_add_sw(p_ftree, p_osm_node->sw);
3159 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3160 "ERR AB0E: " "Node GUID 0x%016" PRIx64
3161 " - Unknown node type: %s\n",
3162 cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
3163 ib_get_node_type_str(osm_node_get_type
3165 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3170 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3172 } /* fabric_populate_nodes() */
3174 /***************************************************
3175 ***************************************************/
/*
 * Assigns new_rank to p_sw unless the switch is already ranked with a
 * rank that is less than or equal to new_rank.
 *
 * Returns a boolean_t; callers in this file treat a true result as
 * "the rank was updated". The return statements themselves are not
 * visible in this copy (TODO confirm).
 */
3177 static boolean_t sw_update_rank(IN ftree_sw_t * p_sw, IN uint32_t new_rank)
3179 if (sw_ranked(p_sw) && p_sw->rank <= new_rank)
3181 p_sw->rank = new_rank;
3186 /***************************************************/
/*
 * Breadth-first ranking of the switches, starting from the switches that
 * are already queued in p_ranking_bfs_list.
 *
 * Each switch taken from the head of the list has its physical ports
 * (port 0 excluded) examined; a remote switch whose rank is updated to
 * "this switch's rank + 1" by sw_update_rank() is appended to the list.
 * max_rank is overwritten with the rank of each switch updated this way,
 * and the final value is stored in p_ftree->max_switch_rank.
 *
 * NOTE(review): the statements that skip unhealthy ports and non-switch
 * remote nodes are not visible in this copy; only their conditions are.
 */
3188 static void rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
3189 IN cl_list_t * p_ranking_bfs_list)
3192 ftree_sw_t *p_remote_sw;
3194 osm_node_t *p_remote_node;
3195 osm_physp_t *p_osm_port;
3197 unsigned max_rank = 0;
3199 while (!cl_is_list_empty(p_ranking_bfs_list)) {
3200 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
3201 p_node = p_sw->p_osm_sw->p_node;
3203 /* note: skipping port 0 on switches */
3204 for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
3205 p_osm_port = osm_node_get_physp_ptr(p_node, i);
3206 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3210 osm_node_get_remote_node(p_node, i, NULL);
3213 if (osm_node_get_type(p_remote_node) !=
3214 IB_NODE_TYPE_SWITCH)
3217 p_remote_sw = fabric_get_sw_by_guid(p_ftree,
3218 osm_node_get_node_guid
3221 /* remote node is not a switch */
3225 /* if needed, rank the remote switch and add it to the BFS list */
3226 if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) {
3227 max_rank = p_remote_sw->rank;
3228 cl_list_insert_tail(p_ranking_bfs_list,
3234 /* set FatTree maximal switch rank */
3235 p_ftree->max_switch_rank = max_rank;
3237 } /* rank_switches_from_leafs() */
3239 /***************************************************/
/*
 * Gives rank 0 to the switches that are directly connected to p_hca and
 * queues them for the BFS ranking pass.
 *
 * For each physical port of the HCA's node the remote node type is
 * checked: a remote CA is reported as an error ("CA conected directly to
 * another CA"), an unknown type is reported with "ERR AB10", and for a
 * remote switch sw_update_rank(p_sw, 0) is attempted; the switch is then
 * logged and appended to p_ranking_bfs_list.
 *
 * Returns an int status; the returned values are not visible in this
 * copy - presumably non-zero on the error paths (TODO confirm).
 */
3241 static int rank_leaf_switches(IN ftree_fabric_t * p_ftree,
3242 IN ftree_hca_t * p_hca,
3243 IN cl_list_t * p_ranking_bfs_list)
3246 osm_node_t *p_osm_node = p_hca->p_osm_node;
3247 osm_node_t *p_remote_osm_node;
3248 osm_physp_t *p_osm_port;
/* NOTE(review): `i` has static storage but is reset to 0 at the start of
   the loop below, so the `static` looks unnecessary - confirm. */
3249 static uint8_t i = 0;
3252 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3254 for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
3255 p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
3256 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3260 osm_node_get_remote_node(p_osm_node, i, NULL);
3261 if (!p_remote_osm_node)
3264 switch (osm_node_get_type(p_remote_osm_node)) {
3265 case IB_NODE_TYPE_CA:
3266 /* HCA connected directly to another HCA - not FatTree */
3267 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3269 "CA conected directly to another CA: " "0x%016"
3270 PRIx64 " <---> 0x%016" PRIx64 "\n",
3271 hca_get_guid_ho(p_hca),
3272 cl_ntoh64(osm_node_get_node_guid
3273 (p_remote_osm_node)));
3277 case IB_NODE_TYPE_ROUTER:
3278 /* leaving this port - proceeding to the next one */
3281 case IB_NODE_TYPE_SWITCH:
3282 /* continue with this port */
3286 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3287 "ERR AB10: Node GUID 0x%016" PRIx64
3288 " - Unknown node type: %s\n",
3289 cl_ntoh64(osm_node_get_node_guid
3290 (p_remote_osm_node)),
3291 ib_get_node_type_str(osm_node_get_type
3292 (p_remote_osm_node)));
3297 /* remote node is switch */
3299 p_sw = fabric_get_sw_by_guid(p_ftree,
3300 osm_node_get_node_guid
3301 (p_osm_port->p_remote_physp->
3305 /* if needed, rank the remote switch and add it to the BFS list */
3307 if (!sw_update_rank(p_sw, 0))
3309 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3310 "Marking rank of switch that is directly connected to CA:\n"
3311 " - CA guid : 0x%016"
3313 " - Switch guid: 0x%016"
3315 " - Switch LID : %u\n",
3316 hca_get_guid_ho(p_hca),
3317 sw_get_guid_ho(p_sw), p_sw->lid);
3318 cl_list_insert_tail(p_ranking_bfs_list, p_sw);
3322 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3324 } /* rank_leaf_switches() */
3326 /***************************************************/
/*
 * Per-item callback applied to the switch table by fabric_rank_from_hcas()
 * through cl_qmap_apply_func().  Replaces a switch rank with
 * (max_switch_rank - rank), i.e. measures it from the other end of the tree.
 *
 * p_map_item - map item, cast to the ftree_sw_t that is updated
 * context    - cast to the ftree_fabric_t that supplies max_switch_rank
 */
3328 static void sw_reverse_rank(IN cl_map_item_t * const p_map_item,
3331 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
3332 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
/* 0xFFFFFFFF is left untouched; fabric_rerank_using_root() stores the same
   value when it resets ranks, so it presumably means "not ranked" - confirm */
3333 if (p_sw->rank != 0xFFFFFFFF)
3334 p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
3337 /***************************************************
3338 ***************************************************/
/*
 * Build the ftree port records for one CA.
 *
 * Each physical port of the CA node is examined.  The loop tests for a
 * missing or unhealthy port, for a port flagged in
 * p_hca->disconnected_ports[], and for a missing remote port or node
 * (such ports are presumably skipped).  For a port whose remote node is a
 * switch, the remote ftree switch is looked up by node GUID, the local port
 * GUID is checked against the optional CN and I/O GUID tables, the
 * classification is logged, p_ftree->ca_ports is incremented and the port
 * is registered with hca_add_port().
 *
 * p_ftree - fabric under construction (GUID tables, counters, log)
 * p_hca   - CA whose ports are being constructed
 *
 * fabric_populate_ports() compares the result of this function against 0.
 */
3341 fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
3343 ftree_sw_t *p_remote_sw;
3344 osm_node_t *p_node = p_hca->p_osm_node;
3345 osm_node_t *p_remote_node;
3346 uint8_t remote_node_type;
3347 ib_net64_t remote_node_guid;
3348 osm_physp_t *p_remote_osm_port;
3350 uint8_t remote_port_num;
3352 boolean_t is_in_cn_file;
3354 boolean_t is_cns_file_provided = fabric_cns_provided(p_ftree);
3355 boolean_t is_ios_file_provided = fabric_ios_provided(p_ftree);
3358 for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
3359 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
/* reset for every port before the CN-file lookup further down */
3362 is_in_cn_file = FALSE;
3364 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
/* disconnected_ports[] entries are set by remove_depended_hca() */
3367 if (p_hca->disconnected_ports[i])
3370 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3372 osm_node_get_remote_node(p_node, i, &remote_port_num);
3374 if (!p_remote_osm_port || !p_remote_node)
3377 remote_node_type = osm_node_get_type(p_remote_node);
3378 remote_node_guid = osm_node_get_node_guid(p_remote_node);
3380 switch (remote_node_type) {
3381 case IB_NODE_TYPE_ROUTER:
3382 /* leaving this port - proceeding to the next one */
3385 case IB_NODE_TYPE_CA:
3386 /* HCA connected directly to another HCA - not FatTree */
/* NOTE(review): "conected" in the message below is a typo for "connected" */
3387 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3389 "CA conected directly to another CA: " "0x%016"
3390 PRIx64 " <---> 0x%016" PRIx64 "\n",
3391 cl_ntoh64(osm_node_get_node_guid(p_node)),
3392 cl_ntoh64(remote_node_guid));
3396 case IB_NODE_TYPE_SWITCH:
3397 /* continue with this port */
3401 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3402 "ERR AB12: Node GUID 0x%016" PRIx64
3403 " - Unknown node type: %s\n",
3404 cl_ntoh64(remote_node_guid),
3405 ib_get_node_type_str(remote_node_type));
3410 /* remote node is switch */
3412 p_remote_sw = fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3413 CL_ASSERT(p_remote_sw);
3415 /* If CN file is not supplied, then all the CAs considered as Compute Nodes.
3416 Otherwise all the CAs are not CNs, and only guids that are present in the
3417 CN file will be marked as compute nodes. */
/* the CN table is probed with a key derived from the port GUID through
   cl_ntoh64(); cl_qmap_end() is the "not found" sentinel */
3418 if (is_cns_file_provided == TRUE) {
3419 name_map_item_t *p_elem = (name_map_item_t *)
3420 cl_qmap_get(&p_ftree->cn_guid_tbl,
3421 cl_ntoh64(osm_physp_get_port_guid
3423 if (p_elem == (name_map_item_t *)
3424 cl_qmap_end(&p_ftree->cn_guid_tbl))
3427 is_in_cn_file = TRUE;
/* the I/O table is only consulted for ports that were not matched in the
   CN file */
3429 if (is_in_cn_file == FALSE && is_ios_file_provided == TRUE) {
3430 name_map_item_t *p_elem = (name_map_item_t *)
3431 cl_qmap_get(&p_ftree->io_guid_tbl,
3432 cl_ntoh64(osm_physp_get_port_guid
3434 if (p_elem != (name_map_item_t *)
3435 cl_qmap_end(&p_ftree->io_guid_tbl)) {
/* one of the three debug messages below reports the classification
   (CN, I/O or neither) of this port */
3444 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3445 "Marking CN port GUID 0x%016" PRIx64 "\n",
3446 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3448 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3449 "Marking I/O port GUID 0x%016" PRIx64 "\n",
3450 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3452 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3453 "Marking non-CN port GUID 0x%016" PRIx64 "\n",
3454 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3456 p_ftree->ca_ports++;
/* LIDs are converted with cl_ntoh16(); the remote switch LID is read from
   its port 0 */
3458 hca_add_port(p_ftree,
3459 p_hca, /* local ftree_hca object */
3460 i, /* local port number */
3461 remote_port_num, /* remote port number */
3462 cl_ntoh16(osm_node_get_base_lid(p_node, i)), /* local lid */
3463 cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0)), /* remote lid */
3464 osm_physp_get_port_guid(p_osm_port), /* local port guid */
3465 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */
3466 remote_node_guid, /* remote node guid */
3467 remote_node_type, /* remote node type */
3468 (void *)p_remote_sw, /* remote ftree_hca/sw object */
3469 is_cn, is_io); /* whether this port is compute node */
3474 } /* fabric_construct_hca_ports() */
3476 /***************************************************
3477 ***************************************************/
/*
 * Build the ftree port records for one switch.
 *
 * Ports 1 .. num_physp-1 of the switch node are examined.  For a port with
 * a healthy link and a remote port, the remote node is classified:
 *   - CA:     the remote ftree_hca is looked up by node GUID, the direction
 *             is FTREE_DIRECTION_DOWN and the LID is read from the remote
 *             physical port;
 *   - switch: the remote ftree_sw is looked up by node GUID, the direction
 *             is chosen by comparing the two ranks, and the LID is read
 *             from port 0 of the remote node;
 *   - router and unknown types have their own case labels.
 * A link from the switch back to itself is logged as a loopback.  The port
 * is then registered with sw_add_port() and p_ftree->lft_max_lid is raised
 * to the remote LID when that LID is larger.
 *
 * p_ftree - fabric under construction
 * p_sw    - switch whose ports are being constructed; its node must be of
 *           type IB_NODE_TYPE_SWITCH (asserted)
 *
 * fabric_populate_ports() compares the result of this function against 0.
 */
3479 static int fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
3480 IN ftree_sw_t * p_sw)
3482 ftree_hca_t *p_remote_hca;
3483 ftree_sw_t *p_remote_sw;
3484 osm_node_t *p_node = p_sw->p_osm_sw->p_node;
3485 osm_node_t *p_remote_node;
3486 uint16_t remote_lid;
3487 uint8_t remote_node_type;
3488 ib_net64_t remote_node_guid;
3489 osm_physp_t *p_remote_osm_port;
3490 ftree_direction_t direction;
3491 void *p_remote_hca_or_sw;
3493 uint8_t remote_port_num;
3496 CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
/* the scan starts at 1: port 0 of a switch is not walked as a link, the
   same convention fabric_rank_from_roots() uses */
3498 for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
3499 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
3500 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3503 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3504 if (!p_remote_osm_port)
3508 osm_node_get_remote_node(p_node, i, &remote_port_num);
3512 /* ignore any loopback connection on switch */
3513 if (p_node == p_remote_node) {
3514 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3515 "Ignoring loopback on switch GUID 0x%016" PRIx64
3516 ", LID %u, rank %u\n",
3517 sw_get_guid_ho(p_sw),
3518 p_sw->lid, p_sw->rank);
3522 remote_node_type = osm_node_get_type(p_remote_node);
3523 remote_node_guid = osm_node_get_node_guid(p_remote_node);
3525 switch (remote_node_type) {
3526 case IB_NODE_TYPE_ROUTER:
3527 /* leaving this port - proceeding to the next one */
3530 case IB_NODE_TYPE_CA:
3531 /* switch connected to hca */
3534 fabric_get_hca_by_guid(p_ftree, remote_node_guid);
3535 CL_ASSERT(p_remote_hca);
3537 p_remote_hca_or_sw = (void *)p_remote_hca;
3538 direction = FTREE_DIRECTION_DOWN;
/* for a CA the LID is read from the remote physical port itself */
3541 cl_ntoh16(osm_physp_get_base_lid(p_remote_osm_port));
3544 case IB_NODE_TYPE_SWITCH:
3545 /* switch connected to another switch */
3548 fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3549 CL_ASSERT(p_remote_sw);
3551 p_remote_hca_or_sw = (void *)p_remote_sw;
/* a numerically larger local rank means the remote switch is "up";
   equal ranks give FTREE_DIRECTION_SAME, anything else is "down" */
3553 if (p_sw->rank > p_remote_sw->rank) {
3554 direction = FTREE_DIRECTION_UP;
3555 } else if (p_sw->rank == p_remote_sw->rank) {
3556 direction = FTREE_DIRECTION_SAME;
3558 direction = FTREE_DIRECTION_DOWN;
3560 /* switch LID is only in port 0 port_info structure */
3562 cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0));
3567 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3568 "ERR AB13: Node GUID 0x%016" PRIx64
3569 " - Unknown node type: %s\n",
3570 cl_ntoh64(remote_node_guid),
3571 ib_get_node_type_str(remote_node_type));
3575 sw_add_port(p_sw, /* local ftree_sw object */
3576 i, /* local port number */
3577 remote_port_num, /* remote port number */
3578 p_sw->lid, /* local lid */
3579 remote_lid, /* remote lid */
3580 osm_physp_get_port_guid(p_osm_port), /* local port guid */
3581 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */
3582 remote_node_guid, /* remote node guid */
3583 remote_node_type, /* remote node type */
3584 p_remote_hca_or_sw, /* remote ftree_hca/sw object */
3585 direction); /* port direction (up or down) */
3587 /* Track the max lid (in host order) that exists in the fabric */
3588 if (remote_lid > p_ftree->lft_max_lid)
3589 p_ftree->lft_max_lid = remote_lid;
3594 } /* fabric_construct_sw_ports() */
3596 /***************************************************
3597 ***************************************************/
/*
 * Context handed by fabric_load_roots() to parse_node_map() and received
 * by the rank_root_sw_by_guid() callback.  Besides the fabric pointer
 * below, both functions also use a `list` member that holds the BFS list
 * the root switches are appended to.
 */
3598 struct rank_root_cxt {
3599 ftree_fabric_t *fabric;
3602 /***************************************************
3603 ***************************************************/
/*
 * parse_node_map() callback used by fabric_load_roots(): handles one GUID
 * read from the root GUID file.
 *
 * cxt  - struct rank_root_cxt carrying the fabric and the target list
 * guid - GUID from the file; it is passed through cl_hton64() before the
 *        switch lookup and printed as-is in the log messages
 * p    - not referenced in the statements shown here
 *
 * A GUID that matches no switch is reported with ERR AB24; a matching
 * switch is appended to the context list.
 */
3604 static int rank_root_sw_by_guid(void *cxt, uint64_t guid, char *p)
3606 struct rank_root_cxt *c = cxt;
3609 sw = fabric_get_sw_by_guid(c->fabric, cl_hton64(guid));
3611 /* the specified root guid wasn't found in the fabric */
3612 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3613 "Root switch GUID 0x%" PRIx64 " not found\n", guid);
3617 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_DEBUG,
3618 "Ranking root switch with GUID 0x%" PRIx64 "\n", guid);
3620 cl_list_insert_tail(c->list, sw);
3624 /***************************************************
3625 ***************************************************/
/*
 * Read the user-supplied root GUID file and collect the matching switches.
 *
 * The file named by subn.opt.root_guid_file is parsed with
 * parse_node_map(), using rank_root_sw_by_guid() as the per-entry callback,
 * so every root that is found ends up in p_ranking_bfs_list.  A parse
 * failure is reported with ERR AB2A and an empty result with ERR AB25;
 * otherwise the number of collected roots is logged.
 *
 * p_ftree            - fabric whose options name the root GUID file
 * p_ranking_bfs_list - list that receives the root switches
 *
 * fabric_rank() uses the boolean result to decide whether ranking can
 * start from the roots.
 */
3626 static boolean_t fabric_load_roots(IN ftree_fabric_t * p_ftree,
3627 IN cl_list_t* p_ranking_bfs_list)
3629 struct rank_root_cxt context;
3632 if (p_ranking_bfs_list) {
3634 /* Rank all the roots and add them to list */
3635 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3636 "Fetching root nodes from file %s\n",
3637 p_ftree->p_osm->subn.opt.root_guid_file);
3639 context.fabric = p_ftree;
3640 context.list = p_ranking_bfs_list;
/* a non-zero result from parse_node_map() is treated as a parse error */
3641 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3642 rank_root_sw_by_guid, &context)) {
3643 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB2A: "
3644 "cannot parse root guids file \'%s\'\n",
3645 p_ftree->p_osm->subn.opt.root_guid_file);
/* the list length is the number of root GUIDs that matched a switch */
3649 num_roots = cl_list_count(p_ranking_bfs_list);
3651 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3652 "No valid roots supplied\n");
3656 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3657 "Ranked %u valid root switches\n", num_roots);
3661 /***************************************************
3662 ***************************************************/
/*
 * Breadth-first ranking that starts from the switches already queued in
 * p_ranking_bfs_list.
 *
 * Switches are taken from the head of the list one at a time.  For each of
 * their ports 1 .. num_physp-1 with a healthy link to another switch, the
 * neighbour is offered rank (current rank + 1) through sw_update_rank();
 * when that call reports a change the neighbour is logged, its rank is
 * recorded in max_rank and it is appended to the list.  When the list is
 * empty, max_rank is stored in p_ftree->max_switch_rank.
 *
 * p_ftree            - fabric being ranked
 * p_ranking_bfs_list - work list, pre-loaded with the starting switches;
 *                      a NULL pointer is tested for before the loop
 */
3663 static int fabric_rank_from_roots(IN ftree_fabric_t * p_ftree,
3664 IN cl_list_t* p_ranking_bfs_list)
3666 osm_node_t *p_osm_node;
3667 osm_node_t *p_remote_osm_node;
3668 osm_physp_t *p_osm_physp;
3670 ftree_sw_t *p_remote_sw;
3672 unsigned max_rank = 0;
3675 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3677 if (!p_ranking_bfs_list) {
3681 while (!cl_is_list_empty(p_ranking_bfs_list)) {
3682 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
3683 p_osm_node = p_sw->p_osm_sw->p_node;
3685 /* note: skipping port 0 on switches */
3686 for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) {
3687 p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i);
3688 if (!p_osm_physp || !osm_link_is_healthy(p_osm_physp))
3692 osm_node_get_remote_node(p_osm_node, i, NULL);
3693 if (!p_remote_osm_node)
/* only switch-to-switch links take part in the ranking */
3696 if (osm_node_get_type(p_remote_osm_node) !=
3697 IB_NODE_TYPE_SWITCH)
3700 p_remote_sw = fabric_get_sw_by_guid(p_ftree,
3701 osm_node_get_node_guid
3702 (p_remote_osm_node));
3703 CL_ASSERT(p_remote_sw);
3705 /* if needed, rank the remote switch and add it to the BFS list */
3706 if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) {
3707 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3708 "Ranking switch 0x%" PRIx64
3710 sw_get_guid_ho(p_remote_sw),
/* remember the rank of the most recently updated switch */
3712 max_rank = p_remote_sw->rank;
3713 cl_list_insert_tail(p_ranking_bfs_list,
3717 /* done with ports of this switch - go to the next switch in the list */
3720 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3721 "Subnet ranking completed. Max Node Rank = %u\n", max_rank);
3723 /* set FatTree maximal switch rank */
3724 p_ftree->max_switch_rank = max_rank;
3727 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3729 } /* fabric_rank_from_roots() */
3731 /***************************************************
3732 ***************************************************/
/*
 * Rank the fabric starting from the CAs.
 *
 * A local BFS list is created, rank_leaf_switches() is run for every CA in
 * p_ftree->hca_tbl to queue the switches attached to CAs, and
 * rank_switches_from_leafs() then ranks the remaining switches.  Because
 * that numbering grows away from the CAs, sw_reverse_rank() is finally
 * applied to every entry of the switch table to flip the direction.
 *
 * p_ftree - fabric being ranked
 *
 * fabric_rank() stores the result in its status variable.
 */
3734 static int fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
3737 ftree_hca_t *p_next_hca;
3738 cl_list_t ranking_bfs_list;
3741 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3743 cl_list_init(&ranking_bfs_list, 10);
3745 /* Mark REVERSED rank of all the switches in the subnet.
3746 Start from switches that are connected to hca's, and
3747 scan all the switches in the subnet. */
3748 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3749 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
/* the successor is fetched before the current CA is processed */
3751 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3752 if (rank_leaf_switches(p_ftree, p_hca, &ranking_bfs_list) != 0) {
/* NOTE(review): unlike the other log messages in this file, the text
   below does not end with "\n" - confirm whether that is intended */
3754 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3756 "Subnet ranking failed - subnet is not FatTree");
3761 /* Now rank rest of the switches in the fabric, while the
3762 list already contains all the ranked leaf switches */
3763 rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
3765 /* fix ranking of the switches by reversing the ranking direction */
3766 cl_qmap_apply_func(&p_ftree->sw_tbl, sw_reverse_rank, (void *)p_ftree);
3769 cl_list_destroy(&ranking_bfs_list);
3770 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3772 } /* fabric_rank_from_hcas() */
3774 /***************************************************
3775 * After ranking from HCA's we want to re-rank using the switches that ended up with rank 0 as roots
3777 ***************************************************/
/*
 * Second ranking pass that follows fabric_rank_from_hcas().
 *
 * The switch table is walked once: every switch whose rank is 0 is appended
 * to p_ranking_bfs_list, and ranks are reset to 0xFFFFFFFF (presumably only
 * for the switches that are not queued - confirm against the branch around
 * that assignment).  fabric_rank_from_roots() is then run on the list and
 * its result is kept in res.
 *
 * p_ftree            - fabric being re-ranked
 * p_ranking_bfs_list - list that receives the rank-0 switches
 */
3778 static int fabric_rerank_using_root(IN ftree_fabric_t * p_ftree,
3779 IN cl_list_t* p_ranking_bfs_list)
3781 ftree_sw_t *p_sw = NULL;
3782 ftree_sw_t *p_next_sw;
3785 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3787 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3788 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
/* the successor is fetched before the current switch is examined */
3790 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
/* rank-0 switches become the starting points of the new BFS */
3791 if (p_sw->rank == 0)
3792 cl_list_insert_tail(p_ranking_bfs_list, p_sw);
3794 p_sw->rank = 0xFFFFFFFF;
3796 res = fabric_rank_from_roots(p_ftree, p_ranking_bfs_list);
3797 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3800 /***************************************************
3801 ***************************************************/
/*
 * Rank every switch of the fabric.
 *
 * Two strategies appear here.  When a root GUID file is configured and
 * fabric_load_roots() succeeds, the ranking starts from those roots via
 * fabric_rank_from_roots().  The alternative path ranks from the CAs with
 * fabric_rank_from_hcas() and re-ranks with fabric_rerank_using_root().
 * The resulting p_ftree->max_switch_rank is logged and the local BFS list
 * is destroyed before returning.
 *
 * p_ftree - fabric being ranked
 *
 * construct_fabric() treats a non-zero result as a ranking failure.
 */
3802 static int fabric_rank(IN ftree_fabric_t * p_ftree)
3805 cl_list_t ranking_bfs_list;
3807 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3808 cl_list_init(&ranking_bfs_list, 10);
/* fabric_load_roots() is only called when a roots file is configured */
3810 if (fabric_roots_provided(p_ftree) &&
3811 fabric_load_roots(p_ftree, &ranking_bfs_list))
3812 res = fabric_rank_from_roots(p_ftree, &ranking_bfs_list);
3814 res = fabric_rank_from_hcas(p_ftree);
3816 res = fabric_rerank_using_root(p_ftree, &ranking_bfs_list);
3822 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3823 "FatTree max switch rank is %u\n", p_ftree->max_switch_rank);
3826 cl_list_destroy(&ranking_bfs_list);
3827 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3829 } /* fabric_rank() */
3831 /***************************************************
3832 ***************************************************/
/*
 * Decide which rank the leaf switches have and store it in
 * p_ftree->leaf_switch_rank.
 *
 * Without a root GUID file the leaf rank is simply max_switch_rank.  The
 * other path walks the CA table to select a CA, picks that CA's first up
 * port group flagged is_cn, and uses the rank of the switch at the remote
 * end of that group.  The chosen rank is logged.
 *
 * p_ftree - fabric whose ranking and CA ports are already in place
 */
3834 static void fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
3838 ftree_hca_t *p_hca = NULL;
3839 ftree_hca_t *p_next_hca;
3841 OSM_LOG_ENTER(&p_ftree->p_osm->log);
3843 if (!fabric_roots_provided(p_ftree)) {
3844 /* If root file is not provided, the fabric has to be pure fat-tree
3845 in terms of ranking. Thus, leaf switches rank is the max rank. */
3846 p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
3848 /* Find the first CN and set the leaf_switch_rank to the rank
3849 of the switch that is connected to this CN. Later we will
3850 ensure that all the leaf switches have the same rank. */
3851 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3852 while (p_next_hca !=
3853 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3858 (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3860 /* we know that there are CNs in the fabric, so just to be sure... */
3861 CL_ASSERT(p_next_hca !=
3862 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));
3864 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3865 "Selected CN port GUID 0x%" PRIx64 "\n",
3866 hca_get_guid_ho(p_hca));
/* advance i to the first up port group that is marked as a CN group */
3868 for (i = 0; (i < p_hca->up_port_groups_num)
3869 && (!p_hca->up_port_groups[i]->is_cn); i++)
/* such a group must exist and must lead to a switch */
3871 CL_ASSERT(i < p_hca->up_port_groups_num);
3872 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
3873 IB_NODE_TYPE_SWITCH);
3875 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
3876 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3877 "Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
3878 sw_get_guid_ho(p_sw), p_sw->rank);
3879 p_ftree->leaf_switch_rank = p_sw->rank;
3882 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3883 "FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
3884 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3885 } /* fabric_set_leaf_rank() */
3887 /***************************************************
3888 ***************************************************/
/*
 * Construct the port records of every CA and every switch in the fabric.
 *
 * The CA table is walked first, calling fabric_construct_hca_ports() for
 * each entry, then the switch table is walked with
 * fabric_construct_sw_ports().  A non-zero result from either helper is
 * tested for inside the loops.
 *
 * p_ftree - fabric whose hca_tbl and sw_tbl are already populated
 *
 * construct_fabric() treats a non-zero result as "not a fat-tree".
 */
3890 static int fabric_populate_ports(IN ftree_fabric_t * p_ftree)
3893 ftree_hca_t *p_next_hca;
3895 ftree_sw_t *p_next_sw;
3898 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* pass 1: CAs */
3900 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3901 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3903 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3904 if (fabric_construct_hca_ports(p_ftree, p_hca) != 0) {
/* pass 2: switches */
3910 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3911 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3913 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3914 if (fabric_construct_sw_ports(p_ftree, p_sw) != 0) {
3920 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3922 } /* fabric_populate_ports() */
3924 /***************************************************
3925 ***************************************************/
/*
 * parse_node_map() callback used by fabric_read_guid_files(): records one
 * GUID from a CN or I/O GUID file.
 *
 * cxt  - the cl_qmap_t that collects the GUIDs
 * guid - GUID read from the file; used as the map key
 * p    - not referenced in the statements shown here
 *
 * A name_map_item_t is allocated with malloc() and inserted under the
 * GUID.  cl_qmap_insert() returns an item pointer that is compared with
 * the new one; a mismatch presumably means the key was already present -
 * confirm against the complib documentation.
 */
3926 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3928 cl_qmap_t *map = cxt;
3929 name_map_item_t *item;
3930 name_map_item_t *inserted_item;
3932 item = malloc(sizeof(*item));
3937 inserted_item = (name_map_item_t *) cl_qmap_insert(map, guid, &item->item);
3938 if (inserted_item != item)
/*
 * Load the optional compute-node and I/O-node GUID files.
 *
 * For each file that is configured (fabric_cns_provided() /
 * fabric_ios_provided()), the file is parsed with parse_node_map() and
 * add_guid_item_to_map() into p_ftree->cn_guid_tbl or
 * p_ftree->io_guid_tbl.  A parse failure (ERR AB23 / ERR AB28) and a table
 * that is still empty after parsing are both reported as errors.
 *
 * p_ftree - fabric whose GUID tables are filled
 *
 * construct_fabric() treats a non-zero result as a failure to read the
 * GUID files.
 */
3944 static int fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
3948 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* compute-node GUID file */
3950 if (fabric_cns_provided(p_ftree)) {
3951 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3952 "Fetching compute nodes from file %s\n",
3953 p_ftree->p_osm->subn.opt.cn_guid_file);
3955 if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
3956 add_guid_item_to_map,
3957 &p_ftree->cn_guid_tbl)) {
3958 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3959 "ERR AB23: " "Problem parsing CN guid file\n");
/* a file that yields no GUID at all is also an error */
3964 if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) {
3965 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3967 "Compute node guids file has no valid guids\n");
/* I/O-node GUID file, handled the same way */
3973 if (fabric_ios_provided(p_ftree)) {
3974 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3975 "Fetching I/O nodes from file %s\n",
3976 p_ftree->p_osm->subn.opt.io_guid_file);
3978 if (parse_node_map(p_ftree->p_osm->subn.opt.io_guid_file,
3979 add_guid_item_to_map,
3980 &p_ftree->io_guid_tbl)) {
3981 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3982 "ERR AB28: Problem parsing I/O guid file\n");
3987 if (!cl_qmap_count(&p_ftree->io_guid_tbl)) {
3988 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3990 "I/O node guids file has no valid guids\n");
3996 OSM_LOG_EXIT(&p_ftree->p_osm->log);
3998 } /*fabric_read_guid_files() */
4000 /***************************************************
4001 ***************************************************/
4002 /* Get a Sw and remove all dependent HCA's, meaning all
4003 * HCA's for which this is the only switch they are connected to */
/*
 * Flag the CA ports that hang off a given switch as disconnected.
 *
 * Every entry of the switch node's physical port table is visited.  For a
 * port with a remote physical port on a node of type IB_NODE_TYPE_CA, the
 * ftree CA is looked up by node GUID and the entry of its
 * disconnected_ports[] array indexed by the remote port number is set to 1.
 * fabric_construct_hca_ports() tests that array for each CA port.
 *
 * p_ftree - fabric that owns the CA table
 * p_sw    - switch whose attached CAs are processed
 *
 * fabric_remove_unranked_sw() logs the result as the number of dependent
 * hca's.
 */
4005 static int remove_depended_hca(IN ftree_fabric_t *p_ftree, IN ftree_sw_t *p_sw)
4010 uint8_t remote_port_num;
4012 osm_node_t* sw_node;
4013 uint64_t remote_hca_guid;
4015 sw_node = p_sw->p_osm_sw->p_node;
/* unlike the other switch loops in this file, this scan starts at port 0 */
4016 for (port_num = 0; port_num < sw_node->physp_tbl_size; port_num++) {
4017 physp = osm_node_get_physp_ptr(sw_node, port_num);
4018 if (physp && physp->p_remote_physp) {
4019 if (osm_node_get_type(physp->p_remote_physp->p_node) == IB_NODE_TYPE_CA) {
4021 osm_node_get_node_guid(physp->p_remote_physp->p_node);
4022 p_hca = fabric_get_hca_by_guid(p_ftree, remote_hca_guid);
/* the index is the port number on the CA side of the link */
4027 osm_physp_get_port_num(physp->p_remote_physp);
4028 p_hca->disconnected_ports[remote_port_num] = 1;
4034 /***************************************************
4035 ***************************************************/
/*
 * Drop from the ftree switch table every switch that the ranking pass did
 * not reach.
 *
 * The table is walked once; for each switch for which sw_ranked() is
 * false, the entry is removed from p_ftree->sw_tbl, remove_depended_hca()
 * is run on it and the removal is logged.  A summary with the number of
 * removed switches is logged at the end.
 *
 * p_ftree - fabric whose switch table is pruned
 */
4036 static void fabric_remove_unranked_sw(IN ftree_fabric_t *p_ftree)
4038 ftree_sw_t *p_sw = NULL;
4039 ftree_sw_t *p_next_sw;
4043 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
4044 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
/* the successor is fetched first, so removing the current item below
   does not disturb the walk */
4046 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
4047 if (!sw_ranked(p_sw)) {
4048 cl_qmap_remove_item(&p_ftree->sw_tbl,&p_sw->map_item);
4049 removed_hca = remove_depended_hca(p_ftree, p_sw);
4050 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4051 "Removing Unranked sw 0x%" PRIx64 " (with %d dependent hca's)\n",
4052 sw_get_guid_ho(p_sw),removed_hca);
4057 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
4058 "Removed %d invalid switches\n", count);
4060 /***************************************************
4061 ***************************************************/
/*
 * Build the whole fat-tree model of the subnet.  osm_ucast_ftree_setup()
 * installs this function as the routing engine's build_lid_matrices
 * callback.
 *
 * context - the ftree_fabric_t created at setup time
 *
 * The steps, in order:
 *   1. clear any previous model (fabric_clear);
 *   2. test the preconditions, each with its own INFO message: LMC > 0,
 *      fewer than two switches, fewer than two non-switch nodes;
 *   3. populate the switch and CA tables and read the optional GUID files;
 *   4. test for fewer than two CAs;
 *   5. rank the switches, drop the unranked ones, and report ERR AB2B when
 *      max rank is 0 while more than one switch remains;
 *   6. construct CA and switch ports and test for zero compute nodes;
 *   7. set the leaf rank and compare the tree rank with
 *      FAT_TREE_MIN_RANK / FAT_TREE_MAX_RANK;
 *   8. mark leaf switches, index the switches, build the leaf switch array
 *      and compute max CNs per leaf;
 *   9. dump the topology and, when no roots file was given, validate that
 *      the topology is a pure fat-tree;
 *  10. build the LID matrices through the unicast manager.
 * The tail of the function contains both a fabric_clear() call and the
 * assignment fabric_built = TRUE, and logs the final status.
 * NOTE(review): these presumably belong to the failure and success paths
 * respectively - confirm.
 */
4062 static int construct_fabric(IN void *context)
4064 ftree_fabric_t *p_ftree = context;
4067 OSM_LOG_ENTER(&p_ftree->p_osm->log);
4069 fabric_clear(p_ftree);
4071 if (p_ftree->p_osm->subn.opt.lmc > 0) {
4072 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4073 "LMC > 0 is not supported by fat-tree routing.\n"
4074 "Falling back to default routing\n");
4079 if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
4080 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4081 "Fabric has %u switches - topology is not fat-tree.\n"
4082 "Falling back to default routing\n",
4083 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
/* all nodes minus switches = nodes that are not switches */
4088 if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
4089 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
4090 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4091 "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
4092 "Falling back to default routing\n",
4093 cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
4094 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
4099 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
4100 " |----------------------------------------|\n"
4101 " |- Starting FatTree fabric construction -|\n"
4102 " |----------------------------------------|\n\n");
4104 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4105 "Populating FatTree Switch and CA tables\n");
4106 if (fabric_populate_nodes(p_ftree) != 0) {
4107 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4108 "Fabric topology is not fat-tree - "
4109 "falling back to default routing\n");
4114 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4115 "Reading guid files provided by user\n");
4116 if (fabric_read_guid_files(p_ftree) != 0) {
4117 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4118 "Failed reading guid files - "
4119 "falling back to default routing\n");
/* this count comes from the ftree CA table, not from the subnet tables */
4124 if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
4125 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4126 "Fabric has %u CAs - topology is not fat-tree.\n"
4127 "Falling back to default routing\n",
4128 cl_qmap_count(&p_ftree->hca_tbl));
4133 /* Rank all the switches in the fabric.
4134 After that we will know only fabric max switch rank.
4135 We will be able to check leaf switches rank and the
4136 whole tree rank after filling ports and marking CNs. */
4137 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
4138 if (fabric_rank(p_ftree) != 0) {
4139 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4140 "Failed ranking the tree\n");
/* switches the ranking did not reach are dropped before ports are built */
4144 fabric_remove_unranked_sw(p_ftree);
4146 if (p_ftree->max_switch_rank == 0 &&
4147 cl_qmap_count(&p_ftree->sw_tbl) > 1) {
4148 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
4149 "ERR AB2B: Found more than one root on fabric with "
4150 "maximum rank 0\n");
4155 /* For each hca and switch, construct array of ports.
4156 This is done after the whole FatTree data structure is ready,
4157 because we want the ports to have pointers to ftree_{sw,hca}_t
4158 objects, and we need the switches to be already ranked because
4159 that's how the port direction is determined. */
4160 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4161 "Populating CA & switch ports\n");
4162 if (fabric_populate_ports(p_ftree) != 0) {
4163 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4164 "Fabric topology is not a fat-tree\n");
4167 } else if (p_ftree->cn_num == 0) {
4168 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4169 "Fabric has no valid compute nodes\n");
4174 /* Now that the CA ports have been created and CNs were marked,
4175 we can complete the fabric ranking - set leaf switches rank. */
4176 fabric_set_leaf_rank(p_ftree);
4178 if (fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
4179 fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
4180 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4181 "Fabric rank is %u (should be between %u and %u)\n",
4182 fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
4188 /* Mark all the switches in the fabric with rank equal to
4189 p_ftree->leaf_switch_rank and that are also connected to CNs.
4190 As a by-product, this function also runs basic topology
4191 validation - it checks that all the CNs are at the same rank. */
4192 if (fabric_mark_leaf_switches(p_ftree)) {
4193 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4194 "Fabric topology is not a fat-tree\n");
4199 /* Assign index to all the switches in the fabric.
4200 This function also sorts leaf switch array by the switch index,
4201 sorts all the port arrays of the indexed switches by remote
4202 switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */
4203 fabric_make_indexing(p_ftree);
4205 /* Create leaf switch array sorted by index.
4206 This array contains switches with rank equal to p_ftree->leaf_switch_rank
4207 and that are also connected to CNs (REAL leafs), and it may contain
4208 switches at the same leaf rank w/o CNs, if this is the order of indexing.
4209 In any case, the first and the last switches in the array are REAL leafs. */
4210 if (fabric_create_leaf_switch_array(p_ftree)) {
4211 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4212 "Fabric topology is not a fat-tree\n");
4217 /* calculate and set ftree.max_cn_per_leaf field */
4218 fabric_set_max_cn_per_leaf(p_ftree);
4220 /* print general info about fabric topology */
4221 fabric_dump_general_info(p_ftree);
4223 /* dump full tree topology */
4224 if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
4225 fabric_dump(p_ftree);
4227 /* the fabric is required to be PURE fat-tree only if the root
4228 guid file hasn't been provided by user */
4229 if (!fabric_roots_provided(p_ftree) &&
4230 !fabric_validate_topology(p_ftree)) {
4231 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4232 "Fabric topology is not a fat-tree\n");
4237 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4238 "Max LID in switch LFTs: %u\n", p_ftree->lft_max_lid);
4240 /* Build the full lid matrices needed for multicast routing */
4241 osm_ucast_mgr_build_lid_matrices(&p_ftree->p_osm->sm.ucast_mgr);
/* NOTE(review): the clean-up below and the fabric_built assignment after
   it presumably sit on the failure and success paths respectively -
   confirm */
4245 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4246 "Clearing FatTree Fabric data structures\n");
4247 fabric_clear(p_ftree);
/* do_routing() tests this flag before it starts routing */
4249 p_ftree->fabric_built = TRUE;
4251 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
4252 " |--------------------------------------------------|\n"
4253 " |- Done constructing FatTree fabric (status = %d) -|\n"
4254 " |--------------------------------------------------|\n\n",
4257 OSM_LOG_EXIT(&p_ftree->p_osm->log);
4259 } /* construct_fabric() */
4261 /***************************************************
4262 ***************************************************/
/*
 * Fill the switch forwarding tables for a fabric that construct_fabric()
 * has built.  osm_ucast_ftree_setup() installs this function as the
 * routing engine's ucast_build_fwd_tables callback.
 *
 * context - the ftree_fabric_t created at setup time
 *
 * The fabric_built flag is tested first.  Routing then proceeds in passes:
 * paths to compute nodes, paths to non-CN targets, switch-to-switch paths
 * and, when the connect_roots option is set, fabric_route_roots().
 * Finally set_sw_fwd_table() is applied to every switch and the HCA
 * ordering file is written.
 */
4264 static int do_routing(IN void *context)
4266 ftree_fabric_t *p_ftree = context;
4269 OSM_LOG_ENTER(&p_ftree->p_osm->log);
/* fabric_built is set by construct_fabric() */
4271 if (!p_ftree->fabric_built) {
4276 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4277 "Starting FatTree routing\n");
4279 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4280 "Filling switch forwarding tables for Compute Nodes\n");
4281 fabric_route_to_cns(p_ftree);
4283 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4284 "Filling switch forwarding tables for non-CN targets\n");
4285 fabric_route_to_non_cns(p_ftree);
4287 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4288 "Filling switch forwarding tables for switch-to-switch paths\n");
4289 fabric_route_to_switches(p_ftree);
/* optional pass, enabled by the connect_roots subnet option */
4291 if (p_ftree->p_osm->subn.opt.connect_roots) {
4292 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4293 "Connecting switches that are unreachable within "
4295 fabric_route_roots(p_ftree);
4298 /* for each switch, set its fwd table */
4299 cl_qmap_apply_func(&p_ftree->sw_tbl, set_sw_fwd_table, (void *)p_ftree);
4301 /* write out hca ordering file */
4302 fabric_dump_hca_ordering(p_ftree);
4304 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4305 "FatTree routing is done\n");
4308 OSM_LOG_EXIT(&p_ftree->p_osm->log);
4312 /***************************************************
4313 ***************************************************/
/*
 * Routing-engine destroy callback (installed by osm_ucast_ftree_setup()):
 * hands the fabric stored in the engine context to fabric_destroy().
 *
 * context - the ftree_fabric_t created at setup time
 */
4315 static void delete(IN void *context)
4319 fabric_destroy((ftree_fabric_t *) context);
4322 /***************************************************
4323 ***************************************************/
4325 int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
4327 ftree_fabric_t *p_ftree = fabric_create();
4331 p_ftree->p_osm = p_osm;
4332 p_ftree->p_subn = p_osm->sm.ucast_mgr.p_subn;
4334 r->context = (void *)p_ftree;
4335 r->build_lid_matrices = construct_fabric;
4336 r->ucast_build_fwd_tables = do_routing;
4337 r->destroy = delete;