2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38 * Implementation of Up Down Algorithm using ranking & Min Hop
39 * Calculation functions
44 #endif /* HAVE_CONFIG_H */
48 #include <complib/cl_debug.h>
49 #include <complib/cl_qmap.h>
50 #include <opensm/osm_switch.h>
51 #include <opensm/osm_opensm.h>
52 #include <opensm/osm_ucast_mgr.h>
54 /* //////////////////////////// */
56 /* //////////////////////////// */
59 typedef enum updn_switch_dir {
74 updn_switch_dir_t dir;
79 /**********************************************************************
80 **********************************************************************/
81 /* This function returns direction based on rank and guid info of current &
83 static updn_switch_dir_t updn_get_dir(unsigned cur_rank, unsigned rem_rank,
84 uint64_t cur_id, uint64_t rem_id)
86 /* HACK: comes to solve root nodes connection, in a classic subnet root nodes do not connect
87 directly, but in case they are we assign to root node an UP direction to allow UPDN to discover
88 the subnet correctly (and not from the point of view of the last root node).
90 if (!cur_rank && !rem_rank)
93 if (cur_rank < rem_rank)
95 else if (cur_rank > rem_rank)
98 /* Equal rank, decide by id number, bigger == UP direction */
106 /**********************************************************************
107 * This function does the bfs of min hop table calculation by guid index
108 * as a starting point.
109 **********************************************************************/
110 static int updn_bfs_by_node(IN osm_log_t * p_log, IN osm_subn_t * p_subn,
111 IN osm_switch_t * p_sw)
117 updn_switch_dir_t next_dir, current_dir;
119 OSM_LOG_ENTER(p_log);
121 lid = osm_node_get_base_lid(p_sw->p_node, 0);
122 lid = cl_ntoh16(lid);
123 osm_switch_set_hops(p_sw, lid, 0, 0);
125 OSM_LOG(p_log, OSM_LOG_DEBUG,
126 "Starting from switch - port GUID 0x%" PRIx64 " lid %u\n",
127 cl_ntoh64(p_sw->p_node->node_info.port_guid), lid);
132 /* Update list with the new element */
133 cl_qlist_init(&list);
134 cl_qlist_insert_tail(&list, &u->list);
136 /* BFS the list till no next element */
137 while (!cl_is_qlist_empty(&list)) {
138 u = (struct updn_node *)cl_qlist_remove_head(&list);
139 u->visited = 0; /* cleanup */
140 current_dir = u->dir;
141 /* Go over all ports of the switch and find unvisited remote nodes */
142 for (pn = 1; pn < u->sw->num_ports; pn++) {
143 osm_node_t *p_remote_node;
144 struct updn_node *rem_u;
145 uint8_t current_min_hop, remote_min_hop,
146 set_hop_return_value;
147 osm_switch_t *p_remote_sw;
150 osm_node_get_remote_node(u->sw->p_node, pn,
152 /* If no remote node OR remote node is not a SWITCH
153 continue to next pn */
154 if (!p_remote_node || !p_remote_node->sw)
156 /* Fetch remote guid only after validation of remote node */
157 p_remote_sw = p_remote_node->sw;
158 rem_u = p_remote_sw->priv;
159 /* Decide which direction to mark it (UP/DOWN) */
160 next_dir = updn_get_dir(u->rank, rem_u->rank,
163 /* Check if this is a legal step : the only illegal step is going
165 if ((current_dir == DOWN) && (next_dir == UP)) {
166 OSM_LOG(p_log, OSM_LOG_DEBUG,
167 "Avoiding move from 0x%016" PRIx64
168 " to 0x%016" PRIx64 "\n",
169 cl_ntoh64(osm_node_get_node_guid(u->sw->p_node)),
170 cl_ntoh64(osm_node_get_node_guid(p_remote_node)));
174 /* Set MinHop value for the current lid */
175 current_min_hop = osm_switch_get_least_hops(u->sw, lid);
176 /* Check hop count if better insert into list && update
177 the remote node Min Hop Table */
179 osm_switch_get_hop_count(p_remote_sw, lid, pn_rem);
180 if (current_min_hop + 1 < remote_min_hop) {
181 set_hop_return_value =
182 osm_switch_set_hops(p_remote_sw, lid,
184 current_min_hop + 1);
185 if (set_hop_return_value) {
186 OSM_LOG(p_log, OSM_LOG_ERROR, "ERR AA01: "
187 "Invalid value returned from set min hop is: %d\n",
188 set_hop_return_value);
190 /* Check if remote port has already been visited */
191 if (!rem_u->visited) {
192 /* Insert updn_switch item into the list */
193 rem_u->dir = next_dir;
195 cl_qlist_insert_tail(&list,
206 /**********************************************************************
207 **********************************************************************/
208 /* NOTE : PLS check if we need to decide that the first */
209 /* rank is a SWITCH for BFS purpose */
210 static int updn_subn_rank(IN updn_t * p_updn)
213 osm_physp_t *p_physp, *p_remote_physp;
216 struct updn_node *u, *remote_u;
217 uint8_t num_ports, port_num;
218 osm_log_t *p_log = &p_updn->p_osm->log;
219 unsigned max_rank = 0;
221 OSM_LOG_ENTER(p_log);
222 cl_qlist_init(&list);
224 /* add all roots to the list */
225 for (item = cl_qmap_head(&p_updn->p_osm->subn.sw_guid_tbl);
226 item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
227 item = cl_qmap_next(item)) {
228 p_sw = (osm_switch_t *)item;
231 cl_qlist_insert_tail(&list, &u->list);
234 /* BFS the list till it's empty */
235 while (!cl_is_qlist_empty(&list)) {
236 u = (struct updn_node *)cl_qlist_remove_head(&list);
237 /* Go over all remote nodes and rank them (if not already visited) */
239 num_ports = p_sw->num_ports;
240 OSM_LOG(p_log, OSM_LOG_DEBUG,
241 "Handling switch GUID 0x%" PRIx64 "\n",
242 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
243 for (port_num = 1; port_num < num_ports; port_num++) {
244 ib_net64_t port_guid;
246 /* Current port fetched in order to get remote side */
248 osm_node_get_physp_ptr(p_sw->p_node, port_num);
253 p_remote_physp = p_physp->p_remote_physp;
256 make sure that all the following occur on p_remote_physp:
257 1. The port isn't NULL
260 if (p_remote_physp && p_remote_physp->p_node->sw) {
261 remote_u = p_remote_physp->p_node->sw->priv;
262 port_guid = p_remote_physp->port_guid;
264 if (remote_u->rank > u->rank + 1) {
265 remote_u->rank = u->rank + 1;
266 max_rank = remote_u->rank;
267 cl_qlist_insert_tail(&list,
269 OSM_LOG(p_log, OSM_LOG_DEBUG,
270 "Rank of port GUID 0x%" PRIx64
271 " = %u\n", cl_ntoh64(port_guid),
278 /* Print Summary of ranking */
279 OSM_LOG(p_log, OSM_LOG_VERBOSE,
280 "Subnet ranking completed. Max Node Rank = %d\n", max_rank);
285 /**********************************************************************
286 **********************************************************************/
287 /* hack: preserve min hops entries to any other root switches */
288 static void updn_clear_root_hops(updn_t * p_updn, osm_switch_t * p_sw)
293 for (i = 0; i < p_sw->num_hops; i++)
296 cl_ptr_vector_get(&p_updn->p_osm->subn.port_lid_tbl,
298 if (!p_port || !p_port->p_node->sw
299 || ((struct updn_node *)p_port->p_node->sw->priv)->
301 memset(p_sw->hops[i], 0xff, p_sw->num_ports);
305 /**********************************************************************
306 **********************************************************************/
307 static int updn_set_min_hop_table(IN updn_t * p_updn)
309 osm_subn_t *p_subn = &p_updn->p_osm->subn;
310 osm_log_t *p_log = &p_updn->p_osm->log;
314 OSM_LOG_ENTER(p_log);
316 /* Go over all the switches in the subnet - for each init their Min Hop
318 OSM_LOG(p_log, OSM_LOG_VERBOSE,
319 "Init Min Hop Table of all switches [\n");
321 for (item = cl_qmap_head(&p_updn->p_osm->subn.sw_guid_tbl);
322 item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
323 item = cl_qmap_next(item)) {
324 p_sw = (osm_switch_t *)item;
325 /* Clear Min Hop Table */
326 if (p_subn->opt.connect_roots)
327 updn_clear_root_hops(p_updn, p_sw);
329 osm_switch_clear_hops(p_sw);
332 OSM_LOG(p_log, OSM_LOG_VERBOSE,
333 "Init Min Hop Table of all switches ]\n");
335 /* Now do the BFS for each port in the subnet */
336 OSM_LOG(p_log, OSM_LOG_VERBOSE,
337 "BFS through all port guids in the subnet [\n");
339 for (item = cl_qmap_head(&p_updn->p_osm->subn.sw_guid_tbl);
340 item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
341 item = cl_qmap_next(item)) {
342 p_sw = (osm_switch_t *)item;
343 updn_bfs_by_node(p_log, p_subn, p_sw);
346 OSM_LOG(p_log, OSM_LOG_VERBOSE,
347 "BFS through all port guids in the subnet ]\n");
353 /**********************************************************************
354 **********************************************************************/
355 static int updn_build_lid_matrices(IN updn_t * p_updn)
359 OSM_LOG_ENTER(&p_updn->p_osm->log);
361 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_VERBOSE,
362 "Ranking all port guids in the list\n");
363 if (!p_updn->num_roots) {
364 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_ERROR, "ERR AA0A: "
365 "No guids were provided or number of guids is 0\n");
370 /* Check if it's not a switched subnet */
371 if (cl_is_qmap_empty(&p_updn->p_osm->subn.sw_guid_tbl)) {
372 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_ERROR, "ERR AAOB: "
373 "This is not a switched subnet, cannot perform UPDN algorithm\n");
378 /* Rank the subnet switches */
379 updn_subn_rank(p_updn);
381 /* After multiple ranking need to set Min Hop Table by UpDn algorithm */
382 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_VERBOSE,
383 "Setting all switches' Min Hop Table\n");
384 status = updn_set_min_hop_table(p_updn);
387 OSM_LOG_EXIT(&p_updn->p_osm->log);
391 /**********************************************************************
392 **********************************************************************/
393 static struct updn_node *create_updn_node(osm_switch_t * sw)
397 u = malloc(sizeof(*u));
400 memset(u, 0, sizeof(*u));
402 u->id = cl_ntoh64(osm_node_get_node_guid(sw->p_node));
403 u->rank = 0xffffffff;
407 static void delete_updn_node(struct updn_node *u)
413 /**********************************************************************
414 **********************************************************************/
415 /* Find Root nodes automatically by Min Hop Table info */
416 static void updn_find_root_nodes_by_min_hop(OUT updn_t * p_updn)
418 osm_opensm_t *p_osm = p_updn->p_osm;
421 osm_physp_t *p_physp;
424 unsigned i, cas_num = 0;
425 unsigned *cas_per_sw;
428 OSM_LOG_ENTER(&p_osm->log);
430 OSM_LOG(&p_osm->log, OSM_LOG_DEBUG,
431 "Current number of ports in the subnet is %d\n",
432 cl_qmap_count(&p_osm->subn.port_guid_tbl));
434 cas_per_sw = malloc((IB_LID_UCAST_END_HO + 1) * sizeof(*cas_per_sw));
436 OSM_LOG(&p_osm->log, OSM_LOG_ERROR, "ERR AA14: "
437 "cannot alloc mem for CAs per switch counter array\n");
440 memset(cas_per_sw, 0, (IB_LID_UCAST_END_HO + 1) * sizeof(*cas_per_sw));
442 /* Find the Maximum number of CAs (and routers) for histogram normalization */
443 OSM_LOG(&p_osm->log, OSM_LOG_VERBOSE,
444 "Finding the number of CAs and storing them in cl_map\n");
445 for (item = cl_qmap_head(&p_updn->p_osm->subn.port_guid_tbl);
446 item != cl_qmap_end(&p_updn->p_osm->subn.port_guid_tbl);
447 item = cl_qmap_next(item)) {
448 p_port = (osm_port_t *)item;
449 if (!p_port->p_node->sw) {
450 p_physp = p_port->p_physp->p_remote_physp;
451 if (!p_physp || !p_physp->p_node->sw)
453 lid_ho = osm_node_get_base_lid(p_physp->p_node, 0);
454 lid_ho = cl_ntoh16(lid_ho);
455 cas_per_sw[lid_ho]++;
460 thd1 = cas_num * 0.9;
461 thd2 = cas_num * 0.05;
462 OSM_LOG(&p_osm->log, OSM_LOG_DEBUG,
463 "Found %u CAs and RTRs, %u SWs in the subnet. "
464 "Thresholds are thd1 = %f && thd2 = %f\n",
465 cas_num, cl_qmap_count(&p_osm->subn.sw_guid_tbl), thd1, thd2);
467 OSM_LOG(&p_osm->log, OSM_LOG_VERBOSE,
468 "Passing through all switches to collect Min Hop info\n");
469 for (item = cl_qmap_head(&p_updn->p_osm->subn.sw_guid_tbl);
470 item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
471 item = cl_qmap_next(item)) {
472 unsigned hop_hist[IB_SUBNET_PATH_HOPS_MAX];
475 uint16_t numHopBarsOverThd1 = 0;
476 uint16_t numHopBarsOverThd2 = 0;
478 p_sw = (osm_switch_t *) item;
480 memset(hop_hist, 0, sizeof(hop_hist));
482 max_lid_ho = p_sw->max_lid_ho;
483 for (lid_ho = 1; lid_ho <= max_lid_ho; lid_ho++)
484 if (cas_per_sw[lid_ho]) {
486 osm_switch_get_least_hops(p_sw, lid_ho);
487 if (hop_val >= IB_SUBNET_PATH_HOPS_MAX)
490 hop_hist[hop_val] += cas_per_sw[lid_ho];
493 /* Now recognize the spines by requiring one bar to be
494 above 90% of the number of CAs and RTRs */
495 for (i = 0; i < IB_SUBNET_PATH_HOPS_MAX; i++) {
496 if (hop_hist[i] > thd1)
497 numHopBarsOverThd1++;
498 if (hop_hist[i] > thd2)
499 numHopBarsOverThd2++;
502 /* If thd conditions are valid - rank the root node */
503 if (numHopBarsOverThd1 == 1 && numHopBarsOverThd2 == 1) {
504 OSM_LOG(&p_osm->log, OSM_LOG_DEBUG,
505 "Ranking GUID 0x%" PRIx64 " as root node\n",
506 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
507 ((struct updn_node *)p_sw->priv)->rank = 0;
514 OSM_LOG_EXIT(&p_osm->log);
518 /**********************************************************************
519 **********************************************************************/
520 static void dump_roots(cl_map_item_t *item, FILE *file, void *cxt)
522 osm_switch_t *sw = (osm_switch_t *)item;
523 if (!((struct updn_node *)sw->priv)->rank)
524 fprintf(file, "0x%" PRIx64 "\n",
525 cl_ntoh64(osm_node_get_node_guid(sw->p_node)));
528 static int update_id(void *cxt, uint64_t guid, char *p)
530 osm_opensm_t *osm = cxt;
535 sw = osm_get_switch_by_guid(&osm->subn, cl_hton64(guid));
537 OSM_LOG(&osm->log, OSM_LOG_VERBOSE,
538 "switch with guid 0x%" PRIx64 " is not found\n", guid);
542 id = strtoull(p, &e, 0);
543 if (*e && !isspace(*e)) {
544 OSM_LOG(&osm->log, OSM_LOG_ERROR,
545 "ERR: cannot parse node id \'%s\'", p);
549 OSM_LOG(&osm->log, OSM_LOG_DEBUG,
550 "update node 0x%" PRIx64 " id to 0x%" PRIx64 "\n", guid, id);
552 ((struct updn_node *)sw->priv)->id = id;
557 static int rank_root_node(void *cxt, uint64_t guid, char *p)
562 sw = osm_get_switch_by_guid(&updn->p_osm->subn, cl_hton64(guid));
564 OSM_LOG(&updn->p_osm->log, OSM_LOG_VERBOSE,
565 "switch with guid 0x%" PRIx64 " is not found\n", guid);
569 OSM_LOG(&updn->p_osm->log, OSM_LOG_DEBUG,
570 "Ranking root port GUID 0x%" PRIx64 "\n", guid);
572 ((struct updn_node *)sw->priv)->rank = 0;
578 /* UPDN callback function */
579 static int updn_lid_matrices(void *ctx)
581 updn_t *p_updn = ctx;
586 OSM_LOG_ENTER(&p_updn->p_osm->log);
588 for (item = cl_qmap_head(&p_updn->p_osm->subn.sw_guid_tbl);
589 item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
590 item = cl_qmap_next(item)) {
591 p_sw = (osm_switch_t *)item;
592 p_sw->priv = create_updn_node(p_sw);
594 OSM_LOG(&(p_updn->p_osm->log), OSM_LOG_ERROR, "ERR AA0C: "
595 "cannot create updn node\n");
596 OSM_LOG_EXIT(&p_updn->p_osm->log);
601 /* First setup root nodes */
602 p_updn->num_roots = 0;
604 if (p_updn->p_osm->subn.opt.root_guid_file) {
605 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_DEBUG,
606 "UPDN - Fetching root nodes from file \'%s\'\n",
607 p_updn->p_osm->subn.opt.root_guid_file);
609 ret = parse_node_map(p_updn->p_osm->subn.opt.root_guid_file,
610 rank_root_node, p_updn);
612 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_ERROR, "ERR : "
613 "cannot parse root guids file \'%s\'\n",
614 p_updn->p_osm->subn.opt.root_guid_file);
615 if (p_updn->p_osm->subn.opt.connect_roots &&
616 p_updn->num_roots > 1)
617 osm_ucast_mgr_build_lid_matrices(&p_updn->p_osm->sm.ucast_mgr);
619 osm_ucast_mgr_build_lid_matrices(&p_updn->p_osm->sm.ucast_mgr);
620 updn_find_root_nodes_by_min_hop(p_updn);
623 if (p_updn->p_osm->subn.opt.ids_guid_file) {
624 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_DEBUG,
625 "UPDN - update node ids from file \'%s\'\n",
626 p_updn->p_osm->subn.opt.ids_guid_file);
628 ret = parse_node_map(p_updn->p_osm->subn.opt.ids_guid_file,
629 update_id, p_updn->p_osm);
631 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_ERROR, "ERR : "
632 "cannot parse node ids file \'%s\'\n",
633 p_updn->p_osm->subn.opt.ids_guid_file);
636 /* Only if there are assigned root nodes do the algorithm, otherwise perform do nothing */
637 if (p_updn->num_roots) {
638 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_DEBUG,
639 "activating UPDN algorithm\n");
640 ret = updn_build_lid_matrices(p_updn);
642 OSM_LOG(&p_updn->p_osm->log, OSM_LOG_INFO,
643 "disabling UPDN algorithm, no root nodes were found\n");
647 if (osm_log_is_active(&p_updn->p_osm->log, OSM_LOG_ROUTING))
648 osm_dump_qmap_to_file(p_updn->p_osm, "opensm-updn-roots.dump",
649 &p_updn->p_osm->subn.sw_guid_tbl,
652 for (item = cl_qmap_head(&p_updn->p_osm->subn.sw_guid_tbl);
653 item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
654 item = cl_qmap_next(item)) {
655 p_sw = (osm_switch_t *) item;
656 delete_updn_node(p_sw->priv);
659 OSM_LOG_EXIT(&p_updn->p_osm->log);
663 /**********************************************************************
664 **********************************************************************/
665 static void updn_delete(void *context)
670 int osm_ucast_updn_setup(struct osm_routing_engine *r, osm_opensm_t *osm)
674 updn = malloc(sizeof(updn_t));
677 memset(updn, 0, sizeof(updn_t));
682 r->delete = updn_delete;
683 r->build_lid_matrices = updn_lid_matrices;