2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38 * Implementation of osm_ucast_mgr_t.
39 * This file implements the Unicast Manager object.
44 #endif /* HAVE_CONFIG_H */
50 #include <iba/ib_types.h>
51 #include <complib/cl_qmap.h>
52 #include <complib/cl_debug.h>
53 #include <complib/cl_qlist.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_UCAST_MGR_C
56 #include <opensm/osm_ucast_mgr.h>
57 #include <opensm/osm_sm.h>
58 #include <opensm/osm_log.h>
59 #include <opensm/osm_node.h>
60 #include <opensm/osm_switch.h>
61 #include <opensm/osm_helper.h>
62 #include <opensm/osm_msgdef.h>
63 #include <opensm/osm_opensm.h>
65 void osm_ucast_mgr_construct(IN osm_ucast_mgr_t * p_mgr)
67 memset(p_mgr, 0, sizeof(*p_mgr));
70 void osm_ucast_mgr_destroy(IN osm_ucast_mgr_t * p_mgr)
74 OSM_LOG_ENTER(p_mgr->p_log);
76 if (p_mgr->cache_valid)
77 osm_ucast_cache_invalidate(p_mgr);
79 OSM_LOG_EXIT(p_mgr->p_log);
82 ib_api_status_t osm_ucast_mgr_init(IN osm_ucast_mgr_t * p_mgr, IN osm_sm_t * sm)
84 ib_api_status_t status = IB_SUCCESS;
86 OSM_LOG_ENTER(sm->p_log);
88 osm_ucast_mgr_construct(p_mgr);
91 p_mgr->p_log = sm->p_log;
92 p_mgr->p_subn = sm->p_subn;
93 p_mgr->p_lock = sm->p_lock;
95 if (sm->p_subn->opt.use_ucast_cache)
96 cl_qmap_init(&p_mgr->cache_sw_tbl);
98 OSM_LOG_EXIT(p_mgr->p_log);
102 /**********************************************************************
103 Add each switch's own and neighbor LIDs to its LID matrix
104 **********************************************************************/
105 static void ucast_mgr_process_hop_0_1(IN cl_map_item_t * p_map_item,
108 osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
109 osm_node_t *p_remote_node;
110 uint16_t lid, remote_lid;
113 lid = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0));
114 osm_switch_set_hops(p_sw, lid, 0, 0);
116 for (i = 1; i < p_sw->num_ports; i++) {
117 osm_physp_t *p = osm_node_get_physp_ptr(p_sw->p_node, i);
118 p_remote_node = (p && p->p_remote_physp) ?
119 p->p_remote_physp->p_node : NULL;
121 if (p_remote_node && p_remote_node->sw &&
122 p_remote_node != p_sw->p_node) {
123 remote_lid = osm_node_get_base_lid(p_remote_node, 0);
124 remote_lid = cl_ntoh16(remote_lid);
125 osm_switch_set_hops(p_sw, remote_lid, i, p->hop_wf);
130 static void ucast_mgr_process_neighbor(IN osm_ucast_mgr_t * p_mgr,
131 IN osm_switch_t * p_this_sw,
132 IN osm_switch_t * p_remote_sw,
134 IN uint8_t remote_port_num)
142 OSM_LOG_ENTER(p_mgr->p_log);
144 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
145 "Node 0x%" PRIx64 ", remote node 0x%" PRIx64
146 ", port %u, remote port %u\n",
147 cl_ntoh64(osm_node_get_node_guid(p_this_sw->p_node)),
148 cl_ntoh64(osm_node_get_node_guid(p_remote_sw->p_node)),
149 port_num, remote_port_num);
151 p = osm_node_get_physp_ptr(p_this_sw->p_node, port_num);
153 for (item = cl_qmap_head(&p_mgr->p_subn->sw_guid_tbl);
154 item != cl_qmap_end(&p_mgr->p_subn->sw_guid_tbl);
155 item = cl_qmap_next(item)) {
156 p_sw = (osm_switch_t *) item;
157 lid_ho = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0));
158 hops = osm_switch_get_least_hops(p_remote_sw, lid_ho);
159 if (hops == OSM_NO_PATH)
163 osm_switch_get_hop_count(p_this_sw, lid_ho, port_num)) {
164 if (osm_switch_set_hops
165 (p_this_sw, lid_ho, port_num, (uint8_t) hops) != 0)
166 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A03: "
167 "cannot set hops for lid %u at switch 0x%"
169 cl_ntoh64(osm_node_get_node_guid
170 (p_this_sw->p_node)));
171 p_mgr->some_hop_count_set = TRUE;
175 OSM_LOG_EXIT(p_mgr->p_log);
178 static struct osm_remote_node *find_and_add_remote_sys(osm_switch_t * sw,
180 boolean_t dor, struct
181 osm_remote_guids_count
185 osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, port);
186 osm_node_t *node = p->p_remote_physp->p_node;
187 uint8_t rem_port = osm_physp_get_port_num(p->p_remote_physp);
189 for (i = 0; i < r->count; i++)
190 if (r->guids[i].node == node)
191 if (!dor || (r->guids[i].port == rem_port))
194 r->guids[i].node = node;
195 r->guids[i].forwarded_to = 0;
196 r->guids[i].port = rem_port;
/*
 * Choose the egress port that switch p_sw uses to reach one LID of p_port
 * (the port's minimum LID plus lid_offset) and record it in p_sw->new_lft.
 *
 * The LID range comes from osm_port_get_lid_range_ho(); a range containing
 * LID 0 is reported as ERR 3A04.  The egress port is selected by
 * osm_switch_recommend_path().  When the route is not excluded from port
 * profiling, the path is counted with osm_switch_count_path() and, if
 * p_port->priv is set, the matching remote-node entry's forwarded_to
 * counter is incremented.
 *
 * NOTE(review): this listing carries fused line-number prefixes and has
 * gaps in that numbering (e.g. 205-209, 227-228, 233, 237, 261, 264,
 * 273-276), so braces, declarations, call arguments and jump statements
 * are presumably missing - restore them from the upstream file before
 * building.
 */
201 static void ucast_mgr_process_port(IN osm_ucast_mgr_t * p_mgr,
202 IN osm_switch_t * p_sw,
203 IN osm_port_t * p_port,
204 IN unsigned lid_offset)
210 boolean_t is_ignored_by_port_prof;
211 ib_net64_t node_guid;
/* First candidate port handed to osm_switch_recommend_path(). */
212 unsigned start_from = 1;
214 OSM_LOG_ENTER(p_mgr->p_log);
216 osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
218 /* If the lids are zero - then there was some problem with
219 * the initialization. Don't handle this port. */
220 if (min_lid_ho == 0 || max_lid_ho == 0) {
221 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A04: "
222 "Port 0x%" PRIx64 " (%s port %d) has LID 0. An "
223 "initialization error occurred. Ignoring port\n",
224 cl_ntoh64(osm_port_get_guid(p_port)),
225 p_port->p_node->print_desc,
226 p_port->p_physp->port_num);
230 lid_ho = min_lid_ho + lid_offset;
/* Offsets beyond this port's LID range have nothing to route. */
232 if (lid_ho > max_lid_ho)
/*
 * For non-base LIDs (and not DOR) the value computed on line 238 is
 * derived from the port already chosen for the previous LID; presumably
 * it is assigned to start_from - the assignment line is not visible.
 */
235 if (lid_offset && !p_mgr->is_dor)
236 /* ignore potential overflow - it is handled in osm_switch.c */
238 osm_switch_get_port_by_lid(p_sw, lid_ho - 1, OSM_NEW_LFT) + 1;
240 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
241 "Processing port 0x%" PRIx64
242 " (\'%s\' port %u), LID %u [%u,%u]\n",
243 cl_ntoh64(osm_port_get_guid(p_port)),
244 p_port->p_node->print_desc, p_port->p_physp->port_num, lid_ho,
245 min_lid_ho, max_lid_ho);
247 /* TODO - This should be runtime error, not a CL_ASSERT() */
248 CL_ASSERT(max_lid_ho <= IB_LID_UCAST_END_HO);
250 node_guid = osm_node_get_node_guid(p_sw->p_node);
253 The lid matrix contains the number of hops to each
254 lid from each port. From this information we determine
255 how best to distribute the LID range across the ports
256 that can reach those LIDs.
/* Scatter is requested only for the base LID (lid_offset == 0). */
258 port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from,
259 p_mgr->p_subn->ignore_existing_lfts,
260 p_mgr->p_subn->opt.lmc,
262 p_mgr->p_subn->opt.port_shifting,
263 !lid_offset && p_port->use_scatter,
266 if (port == OSM_NO_PATH) {
267 /* do not try to overwrite the ppro of non existing port ... */
268 is_ignored_by_port_prof = TRUE;
270 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
271 "No path to get to LID %u from switch 0x%" PRIx64 "\n",
272 lid_ho, cl_ntoh64(node_guid));
274 osm_physp_t *p = osm_node_get_physp_ptr(p_sw->p_node, port);
278 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
279 "Routing LID %u to port %u for switch 0x%" PRIx64 "\n",
280 lid_ho, port, cl_ntoh64(node_guid));
283 we would like to optionally ignore this port in equalization
284 as in the case of the Mellanox Anafa Internal PCI TCA port
286 is_ignored_by_port_prof = p->is_prof_ignored;
289 We also would ignore this route if the target lid is of
290 a switch and the port_profile_switch_node is not TRUE
292 if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
293 is_ignored_by_port_prof |=
294 (osm_node_get_type(p_port->p_node) ==
295 IB_NODE_TYPE_SWITCH);
299 We have selected the port for this LID.
300 Write it to the forwarding tables.
302 p_sw->new_lft[lid_ho] = port;
303 if (!is_ignored_by_port_prof) {
304 struct osm_remote_node *rem_node_used;
305 osm_switch_count_path(p_sw, port);
/* Port 0 and ports without a priv tracking area are not tracked. */
306 if (port > 0 && p_port->priv &&
307 (rem_node_used = find_and_add_remote_sys(p_sw, port,
310 rem_node_used->forwarded_to++;
314 OSM_LOG_EXIT(p_mgr->p_log);
317 static void alloc_ports_priv(osm_ucast_mgr_t * mgr)
319 cl_qmap_t *port_tbl = &mgr->p_subn->port_guid_tbl;
320 struct osm_remote_guids_count *r;
325 for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
326 item = cl_qmap_next(item)) {
327 port = (osm_port_t *) item;
328 lmc = ib_port_info_get_lmc(&port->p_physp->port_info);
329 r = malloc(sizeof(*r) + sizeof(r->guids[0]) * (1 << lmc));
331 OSM_LOG(mgr->p_log, OSM_LOG_ERROR, "ERR 3A09: "
332 "cannot allocate memory to track remote"
333 " systems for lmc > 0\n");
337 memset(r, 0, sizeof(*r) + sizeof(r->guids[0]) * (1 << lmc));
342 static void free_ports_priv(osm_ucast_mgr_t * mgr)
344 cl_qmap_t *port_tbl = &mgr->p_subn->port_guid_tbl;
347 for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
348 item = cl_qmap_next(item)) {
349 port = (osm_port_t *) item;
357 static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
360 osm_ucast_mgr_t *p_mgr = context;
361 osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
362 unsigned i, lids_per_port;
364 OSM_LOG_ENTER(p_mgr->p_log);
366 CL_ASSERT(p_sw && p_sw->p_node);
368 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
369 "Processing switch 0x%" PRIx64 "\n",
370 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
372 /* Initialize LIDs in buffer to invalid port number. */
373 memset(p_sw->new_lft, OSM_NO_PATH, p_sw->max_lid_ho + 1);
375 alloc_ports_priv(p_mgr);
378 Iterate through every port setting LID routes for each
379 port based on base LID and LMC value.
381 lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
382 for (i = 0; i < lids_per_port; i++) {
383 cl_qlist_t *list = &p_mgr->port_order_list;
384 cl_list_item_t *item;
385 for (item = cl_qlist_head(list); item != cl_qlist_end(list);
386 item = cl_qlist_next(item)) {
387 osm_port_t *port = cl_item_obj(item, port, list_item);
388 ucast_mgr_process_port(p_mgr, p_sw, port, i);
392 free_ports_priv(p_mgr);
394 OSM_LOG_EXIT(p_mgr->p_log);
397 static void ucast_mgr_process_neighbors(IN cl_map_item_t * p_map_item,
400 osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
401 osm_ucast_mgr_t * p_mgr = context;
403 osm_node_t *p_remote_node;
405 uint8_t remote_port_num;
407 osm_physp_t *p_physp;
409 OSM_LOG_ENTER(p_mgr->p_log);
411 p_node = p_sw->p_node;
414 CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
416 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
417 "Processing switch with GUID 0x%" PRIx64 "\n",
418 cl_ntoh64(osm_node_get_node_guid(p_node)));
420 num_ports = osm_node_get_num_physp(p_node);
423 Start with port 1 to skip the switch's management port.
425 for (port_num = 1; port_num < num_ports; port_num++) {
426 p_remote_node = osm_node_get_remote_node(p_node,
429 if (p_remote_node && p_remote_node->sw
430 && (p_remote_node != p_node)) {
431 /* make sure the link is healthy. If it is not - don't
432 propagate through it. */
433 p_physp = osm_node_get_physp_ptr(p_node, port_num);
434 if (!p_physp || !osm_link_is_healthy(p_physp))
437 ucast_mgr_process_neighbor(p_mgr, p_sw,
444 OSM_LOG_EXIT(p_mgr->p_log);
447 static int set_hop_wf(void *ctx, uint64_t guid, char *p)
449 osm_ucast_mgr_t *m = ctx;
450 osm_node_t *node = osm_get_node_by_guid(m->p_subn, cl_hton64(guid));
452 unsigned port, hop_wf;
455 if (!node || !node->sw) {
456 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
457 "switch with guid 0x%016" PRIx64 " is not found\n",
462 if (!p || !*p || !(port = strtoul(p, &e, 0)) || (p == e) ||
463 port >= node->sw->num_ports) {
464 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
465 "bad port specified for guid 0x%016" PRIx64 "\n", guid);
471 if (!*p || !(hop_wf = strtoul(p, &e, 0)) || p == e || hop_wf >= 0x100) {
472 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
473 "bad hop weight factor specified for guid 0x%016" PRIx64
474 "port %u\n", guid, port);
478 physp = osm_node_get_physp_ptr(node, port);
482 physp->hop_wf = hop_wf;
487 static void set_default_hop_wf(cl_map_item_t * p_map_item, void *ctx)
489 osm_switch_t *sw = (osm_switch_t *) p_map_item;
492 for (i = 1; i < sw->num_ports; i++) {
493 osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, i);
499 static int set_search_ordering_ports(void *ctx, uint64_t guid, char *p)
501 osm_subn_t *p_subn = ctx;
502 osm_node_t *node = osm_get_node_by_guid(p_subn, cl_hton64(guid));
504 uint8_t *search_ordering_ports = NULL;
506 unsigned int *ports = NULL;
507 const int bpw = sizeof(*ports)*8;
509 int i = 1; /* port 0 maps to port 0 */
511 if (!node || !(sw = node->sw)) {
512 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
513 "switch with guid 0x%016" PRIx64 " is not found\n",
518 if (sw->search_ordering_ports) {
519 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
520 "switch with guid 0x%016" PRIx64 " already listed\n",
525 search_ordering_ports = malloc(sizeof(*search_ordering_ports)*sw->num_ports);
526 if (!search_ordering_ports) {
527 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR,
528 "ERR 3A07: cannot allocate memory for search_ordering_ports\n");
531 memset(search_ordering_ports, 0, sizeof(*search_ordering_ports)*sw->num_ports);
533 /* the ports array is for record keeping of which ports have
535 words = (sw->num_ports + bpw - 1)/bpw;
536 ports = malloc(words*sizeof(*ports));
538 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR,
539 "ERR 3A08: cannot allocate memory for ports\n");
540 free(search_ordering_ports);
543 memset(ports, 0, words*sizeof(*ports));
545 while ((*p != '\0') && (*p != '#')) {
548 port = strtoul(p, &e, 0);
549 if ((p == e) || (port == 0) || (port >= sw->num_ports) ||
550 !osm_node_get_physp_ptr(node, port)) {
551 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
552 "bad port %d specified for guid 0x%016" PRIx64 "\n",
554 free(search_ordering_ports);
559 if (ports[port/bpw] & (1u << (port%bpw))) {
560 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
561 "port %d already specified for guid 0x%016" PRIx64 "\n",
563 free(search_ordering_ports);
568 ports[port/bpw] |= (1u << (port%bpw));
569 search_ordering_ports[i++] = port;
572 while (isspace(*p)) {
578 for (port = 1; port < sw->num_ports; port++) {
579 /* fill out the rest of the search_ordering_ports array
580 * in sequence using the remaining unspecified
583 if (!(ports[port/bpw] & (1u << (port%bpw)))) {
584 search_ordering_ports[i++] = port;
587 sw->search_ordering_ports = search_ordering_ports;
589 free(search_ordering_ports);
596 int osm_ucast_mgr_build_lid_matrices(IN osm_ucast_mgr_t * p_mgr)
599 uint32_t iteration_max;
600 cl_qmap_t *p_sw_guid_tbl;
602 p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl;
604 OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
605 "Starting switches' Min Hop Table Assignment\n");
608 Set up the weighting factors for the routing.
610 cl_qmap_apply_func(p_sw_guid_tbl, set_default_hop_wf, NULL);
611 if (p_mgr->p_subn->opt.hop_weights_file) {
612 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
613 "Fetching hop weight factor file \'%s\'\n",
614 p_mgr->p_subn->opt.hop_weights_file);
615 if (parse_node_map(p_mgr->p_subn->opt.hop_weights_file,
616 set_hop_wf, p_mgr)) {
617 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A05: "
618 "cannot parse hop_weights_file \'%s\'\n",
619 p_mgr->p_subn->opt.hop_weights_file);
624 Set the switch matrices for each switch's own port 0 LID(s)
625 then set the lid matrices for the each switch's leaf nodes.
627 cl_qmap_apply_func(p_sw_guid_tbl, ucast_mgr_process_hop_0_1, p_mgr);
630 Get the switch matrices for each switch's neighbors.
631 This process requires a number of iterations equal to
632 the number of switches in the subnet minus 1.
634 In each iteration, a switch learns the lid/port/hop
635 information (as contained by a switch's lid matrix) from
636 its immediate neighbors. After each iteration, a switch
637 (and it's neighbors) know more routing information than
638 it did on the previous iteration.
639 Thus, by repeatedly absorbing the routing information of
640 neighbor switches, every switch eventually learns how to
641 route all LIDs on the subnet.
643 Note that there may not be any switches in the subnet if
644 we are in simple p2p configuration.
646 iteration_max = cl_qmap_count(p_sw_guid_tbl);
649 If there are switches in the subnet, iterate until the lid
650 matrix has been constructed. Otherwise, just immediately
651 indicate we're done if no switches exist.
657 we need to find out when the propagation of
658 hop counts has relaxed. So this global variable
659 is preset to 0 on each iteration and if
660 if non of the switches was set will exit the
663 p_mgr->some_hop_count_set = TRUE;
664 for (i = 0; (i < iteration_max) && p_mgr->some_hop_count_set;
666 p_mgr->some_hop_count_set = FALSE;
667 cl_qmap_apply_func(p_sw_guid_tbl,
668 ucast_mgr_process_neighbors, p_mgr);
670 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
671 "Min-hop propagated in %d steps\n", i);
677 static int ucast_mgr_setup_all_switches(osm_subn_t * p_subn)
682 lids = (uint16_t) cl_ptr_vector_get_size(&p_subn->port_lid_tbl);
683 lids = lids ? lids - 1 : 0;
685 for (p_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);
686 p_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl);
687 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
688 if (osm_switch_prepare_path_rebuild(p_sw, lids)) {
689 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, "ERR 3A0B: "
690 "cannot setup switch 0x%016" PRIx64 "\n",
691 cl_ntoh64(osm_node_get_node_guid
695 if (p_sw->search_ordering_ports) {
696 free(p_sw->search_ordering_ports);
697 p_sw->search_ordering_ports = NULL;
701 if (p_subn->opt.port_search_ordering_file) {
702 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_DEBUG,
703 "Fetching dimension ports file \'%s\'\n",
704 p_subn->opt.port_search_ordering_file);
705 if (parse_node_map(p_subn->opt.port_search_ordering_file,
706 set_search_ordering_ports, p_subn)) {
707 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, "ERR 3A0F: "
708 "cannot parse port_search_ordering_file \'%s\'\n",
709 p_subn->opt.port_search_ordering_file);
716 static int add_guid_to_order_list(void *ctx, uint64_t guid, char *p)
718 osm_ucast_mgr_t *m = ctx;
719 osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid));
722 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
723 "port guid not found: 0x%016" PRIx64 "\n", guid);
728 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
729 "port guid specified multiple times 0x%016" PRIx64 "\n",
734 cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
736 port->use_scatter = (m->p_subn->opt.guid_routing_order_no_scatter == TRUE) ? 0 : m->p_subn->opt.scatter_ports;
741 static void add_port_to_order_list(cl_map_item_t * p_map_item, void *ctx)
743 osm_port_t *port = (osm_port_t *) p_map_item;
744 osm_ucast_mgr_t *m = ctx;
747 port->use_scatter = m->p_subn->opt.scatter_ports;
748 cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
753 static int mark_ignored_port(void *ctx, uint64_t guid, char *p)
755 osm_ucast_mgr_t *m = ctx;
756 osm_node_t *node = osm_get_node_by_guid(m->p_subn, cl_hton64(guid));
760 if (!node || !node->sw) {
761 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
762 "switch with guid 0x%016" PRIx64 " is not found\n",
767 if (!p || !*p || !(port = strtoul(p, NULL, 0)) ||
768 port >= node->sw->num_ports) {
769 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
770 "bad port specified for guid 0x%016" PRIx64 "\n", guid);
774 physp = osm_node_get_physp_ptr(node, port);
778 physp->is_prof_ignored = 1;
783 static void clear_prof_ignore_flag(cl_map_item_t * p_map_item, void *ctx)
785 osm_switch_t *sw = (osm_switch_t *) p_map_item;
788 for (i = 1; i < sw->num_ports; i++) {
789 osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, i);
791 p->is_prof_ignored = 0;
795 static void add_sw_endports_to_order_list(osm_switch_t * sw,
802 for (i = 1; i < sw->num_ports; i++) {
803 p = osm_node_get_physp_ptr(sw->p_node, i);
804 if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) {
805 port = osm_get_port_by_guid(m->p_subn,
808 if (!port || port->flag)
810 cl_qlist_insert_tail(&m->port_order_list,
813 port->use_scatter = m->p_subn->opt.scatter_ports;
818 static void sw_count_endport_links(osm_switch_t * sw)
823 sw->endport_links = 0;
824 for (i = 1; i < sw->num_ports; i++) {
825 p = osm_node_get_physp_ptr(sw->p_node, i);
826 if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw)
831 static int compar_sw_load(const void *s1, const void *s2)
833 #define get_sw_endport_links(s) (*(osm_switch_t **)s)->endport_links
834 return get_sw_endport_links(s2) - get_sw_endport_links(s1);
837 static void sort_ports_by_switch_load(osm_ucast_mgr_t * m)
839 int i, num = cl_qmap_count(&m->p_subn->sw_guid_tbl);
840 void **s = malloc(num * sizeof(*s));
842 OSM_LOG(m->p_log, OSM_LOG_ERROR, "ERR 3A0C: "
843 "No memory, skip by switch load sorting.\n");
846 s[0] = cl_qmap_head(&m->p_subn->sw_guid_tbl);
847 for (i = 1; i < num; i++)
848 s[i] = cl_qmap_next(s[i - 1]);
850 for (i = 0; i < num; i++)
851 sw_count_endport_links(s[i]);
853 qsort(s, num, sizeof(*s), compar_sw_load);
855 for (i = 0; i < num; i++)
856 add_sw_endports_to_order_list(s[i], m);
860 static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
862 cl_qlist_init(&p_mgr->port_order_list);
864 if (p_mgr->p_subn->opt.guid_routing_order_file) {
865 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
866 "Fetching guid routing order file \'%s\'\n",
867 p_mgr->p_subn->opt.guid_routing_order_file);
869 if (parse_node_map(p_mgr->p_subn->opt.guid_routing_order_file,
870 add_guid_to_order_list, p_mgr))
871 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0D: "
872 "cannot parse guid routing order file \'%s\'\n",
873 p_mgr->p_subn->opt.guid_routing_order_file);
875 sort_ports_by_switch_load(p_mgr);
877 if (p_mgr->p_subn->opt.port_prof_ignore_file) {
878 cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl,
879 clear_prof_ignore_flag, NULL);
880 if (parse_node_map(p_mgr->p_subn->opt.port_prof_ignore_file,
881 mark_ignored_port, p_mgr)) {
882 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0E: "
883 "cannot parse port prof ignore file \'%s\'\n",
884 p_mgr->p_subn->opt.port_prof_ignore_file);
888 cl_qmap_apply_func(&p_mgr->p_subn->port_guid_tbl,
889 add_port_to_order_list, p_mgr);
891 cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_process_tbl,
894 cl_qlist_remove_all(&p_mgr->port_order_list);
/*
 * Per-switch callback (passed to cl_qmap_apply_func() by
 * osm_ucast_mgr_set_fwd_tables): push an updated SwitchInfo to a switch
 * when needed.
 *
 * A copy of the switch's SwitchInfo gets lin_top set to the switch's
 * max_lid_ho (network order) and its life time set from the subnet's
 * packet_life_time option.  A Set is sent through osm_req_set() only when
 * lin_top or life_state changed.  As a side effect p_mgr->max_lid is
 * raised to the largest max_lid_ho seen.
 *
 * NOTE(review): this listing carries fused line-number prefixes and has
 * gaps in that numbering (e.g. 900-901, 904, 909, 911-912, 942, 961), so
 * the second parameter, several declarations, braces/else and one
 * osm_req_set() argument line are presumably missing - restore them from
 * the upstream file before building.
 */
899 static void ucast_mgr_set_fwd_top(IN cl_map_item_t * p_map_item,
902 osm_ucast_mgr_t *p_mgr = cxt;
903 osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
905 osm_physp_t *p_physp;
906 osm_dr_path_t *p_path;
907 osm_madw_context_t context;
908 ib_api_status_t status;
/* Set when the switch copy differs from what is about to be sent. */
910 boolean_t set_swinfo_require = FALSE;
916 OSM_LOG_ENTER(p_mgr->p_log);
918 CL_ASSERT(p_sw && p_sw->max_lid_ho);
920 p_node = p_sw->p_node;
/* Track the largest top LID over all switches. */
924 if (p_mgr->max_lid < p_sw->max_lid_ho)
925 p_mgr->max_lid = p_sw->max_lid_ho;
/* Port 0 supplies the directed-route path and M_Key used for the Set. */
927 p_physp = osm_node_get_physp_ptr(p_node, 0);
931 p_path = osm_physp_get_dr_path_ptr(p_physp);
934 Set the top of the unicast forwarding table.
/* Work on a copy; the stored switch_info is not modified here. */
936 si = p_sw->switch_info;
937 lin_top = cl_hton16(p_sw->max_lid_ho);
938 if (lin_top != si.lin_top) {
939 set_swinfo_require = TRUE;
940 si.lin_top = lin_top;
941 context.si_context.lft_top_change = TRUE;
943 context.si_context.lft_top_change = FALSE;
/* Detect a life time change by comparing life_state before and after. */
945 life_state = si.life_state;
946 ib_switch_info_set_life_time(&si, p_mgr->p_subn->opt.packet_life_time);
948 if (life_state != si.life_state)
949 set_swinfo_require = TRUE;
951 if (set_swinfo_require) {
952 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
953 "Setting switch FT top to LID %u\n", p_sw->max_lid_ho);
955 context.si_context.light_sweep = FALSE;
956 context.si_context.node_guid = osm_node_get_node_guid(p_node);
957 context.si_context.set_method = TRUE;
959 status = osm_req_set(p_mgr->sm, p_path, (uint8_t *) & si,
960 sizeof(si), IB_MAD_ATTR_SWITCH_INFO,
962 ib_port_info_get_m_key(&p_physp->port_info),
963 CL_DISP_MSGID_NONE, &context);
/* A failed send is only logged; no status is propagated from here. */
965 if (status != IB_SUCCESS)
966 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A06: "
967 "Sending SwitchInfo attribute failed (%s)\n",
968 ib_get_err_str(status));
971 OSM_LOG_EXIT(p_mgr->p_log);
/*
 * Send one linear forwarding table block (IB_SMP_DATA_SIZE bytes, index
 * block_id_ho) of switch p_sw to the fabric.
 *
 * The block is taken from p_sw->new_lft.  When neither the switch nor the
 * subnet is flagged need_update and the block equals the one stored in
 * p_sw->lft, the comparison on lines 1003-1005 matches (presumably the
 * block is then skipped - the statement that follows is not visible).
 * Before sending, the stored block is zeroed so that a failed MAD leads to
 * a resend in the next sweep.  Send failures are logged as ERR 3A10.
 *
 * NOTE(review): this listing carries fused line-number prefixes and has
 * gaps in that numbering (e.g. 976, 991-992, 995-996, 1006-1007, 1014,
 * 1031-1034), so braces, return statements and two argument lines are
 * presumably missing; the return codes cannot be determined from here.
 */
974 static int set_lft_block(IN osm_switch_t *p_sw, IN osm_ucast_mgr_t *p_mgr,
975 IN uint16_t block_id_ho)
977 osm_madw_context_t context;
978 osm_dr_path_t *p_path;
979 osm_physp_t *p_physp;
980 ib_api_status_t status;
983 Send linear forwarding table blocks to the switch
984 as long as the switch indicates it has blocks needing
987 if (!p_sw->new_lft) {
988 /* any routing should provide the new_lft */
989 CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache &&
990 p_mgr->cache_valid && !p_sw->need_update);
/* Port 0 supplies the directed-route path and M_Key used for the Set. */
994 p_physp = osm_node_get_physp_ptr(p_sw->p_node, 0);
998 p_path = osm_physp_get_dr_path_ptr(p_physp);
1000 context.lft_context.node_guid = osm_node_get_node_guid(p_sw->p_node);
1001 context.lft_context.set_method = TRUE;
/* Unchanged-block test: no forced update and new block == stored block. */
1003 if (!p_sw->need_update && !p_mgr->p_subn->need_update &&
1004 !memcmp(p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
1005 p_sw->lft + block_id_ho * IB_SMP_DATA_SIZE,
1010 * Zero the stored LFT block, so in case the MAD will end up
1011 * with error, we will resend it in the next sweep.
1013 memset(p_sw->lft + block_id_ho * IB_SMP_DATA_SIZE, 0,
1016 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1017 "Writing FT block %u to switch 0x%" PRIx64 "\n", block_id_ho,
1018 cl_ntoh64(context.lft_context.node_guid));
/* The attribute modifier carries the block index in network order. */
1020 status = osm_req_set(p_mgr->sm, p_path,
1021 p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
1022 IB_SMP_DATA_SIZE, IB_MAD_ATTR_LIN_FWD_TBL,
1023 cl_hton32(block_id_ho), FALSE,
1024 ib_port_info_get_m_key(&p_physp->port_info),
1025 CL_DISP_MSGID_NONE, &context);
1027 if (status != IB_SUCCESS) {
1028 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A10: "
1029 "Sending linear fwd. tbl. block failed (%s)\n",
1030 ib_get_err_str(status));
1037 static void ucast_mgr_pipeline_fwd_tbl(osm_ucast_mgr_t * p_mgr)
1040 cl_map_item_t *item;
1041 unsigned i, max_block = p_mgr->max_lid / IB_SMP_DATA_SIZE + 1;
1043 tbl = &p_mgr->p_subn->sw_guid_tbl;
1044 for (i = 0; i < max_block; i++)
1045 for (item = cl_qmap_head(tbl); item != cl_qmap_end(tbl);
1046 item = cl_qmap_next(item))
1047 set_lft_block((osm_switch_t *)item, p_mgr, i);
1050 void osm_ucast_mgr_set_fwd_tables(osm_ucast_mgr_t * p_mgr)
1054 cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_set_fwd_top,
1057 ucast_mgr_pipeline_fwd_tbl(p_mgr);
/*
 * Run one routing engine `r` end to end: build the LID matrices, build the
 * forwarding tables, record the engine in osm->routing_engine_used and
 * program the switches via osm_ucast_mgr_set_fwd_tables().
 *
 * For each of the two build steps the engine's own hook is used; when the
 * hook is absent or returns a value > 0, the default min-hop routine of
 * this file runs instead.
 *
 * NOTE(review): this listing carries fused line-number prefixes and has
 * gaps in that numbering (1061-1062, 1075, 1078-1079, 1085, 1088-1089,
 * 1095-1096), so the declaration of `ret`, the conditions guarding the two
 * error logs and all return statements are presumably missing; the exact
 * failure condition and return codes cannot be determined from here.
 */
1060 static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm)
1064 OSM_LOG(&osm->log, OSM_LOG_VERBOSE,
1065 "building routing with \'%s\' routing algorithm...\n", r->name);
1067 /* Set the before each lft build to keep the routes in place between sweeps */
/* Re-seeding with the same value makes the random() sequence repeatable. */
1068 if (osm->subn.opt.scatter_ports)
1069 srandom(osm->subn.opt.scatter_ports);
1071 if (!r->build_lid_matrices ||
1072 (ret = r->build_lid_matrices(r->context)) > 0)
1073 ret = osm_ucast_mgr_build_lid_matrices(&osm->sm.ucast_mgr);
1076 OSM_LOG(&osm->log, OSM_LOG_ERROR,
1077 "%s: cannot build lid matrices\n", r->name);
1081 if (!r->ucast_build_fwd_tables ||
1082 (ret = r->ucast_build_fwd_tables(r->context)) > 0)
1083 ret = ucast_mgr_build_lfts(&osm->sm.ucast_mgr);
1086 OSM_LOG(&osm->log, OSM_LOG_ERROR,
1087 "%s: cannot build fwd tables\n", r->name);
/* Callers read routing_engine_used to learn whether an engine succeeded. */
1091 osm->routing_engine_used = r;
1093 osm_ucast_mgr_set_fwd_tables(&osm->sm.ucast_mgr);
/*
 * Top-level unicast routing pass.
 *
 * Under the exclusive manager lock: prepares all switches, then tries the
 * configured routing engines in list order through ucast_mgr_route().
 * When none of them ends up recorded in routing_engine_used and fallback
 * is not disabled (no_fallback_routing_engine != TRUE), the default
 * routing engine is tried.  On success the engine type is logged and, with
 * use_ucast_cache enabled, the cache is marked valid; otherwise
 * subnet_initialization_error is set and an error is logged.
 *
 * NOTE(review): this listing carries fused line-number prefixes and has
 * gaps in that numbering (1099, 1103-1104, 1118-1120, 1124-1125, 1143,
 * 1148-1149, 1152-1153), so the declaration of `failed`, the early-exit
 * statement after the switch check, the loop's exit test, the else/label
 * lines and the return statement are presumably missing; the return value
 * cannot be determined from here.
 */
1098 int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
1100 osm_opensm_t *p_osm;
1101 struct osm_routing_engine *p_routing_eng;
1102 cl_qmap_t *p_sw_guid_tbl;
1105 OSM_LOG_ENTER(p_mgr->p_log);
1107 p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl;
1108 p_osm = p_mgr->p_subn->p_osm;
1109 p_routing_eng = p_osm->routing_engine_list;
1111 CL_PLOCK_EXCL_ACQUIRE(p_mgr->p_lock);
1114 If there are no switches in the subnet, we are done.
1116 if (cl_qmap_count(p_sw_guid_tbl) == 0 ||
1117 ucast_mgr_setup_all_switches(p_mgr->p_subn) < 0)
/* Cleared first so success can be detected after the engine loop. */
1121 p_osm->routing_engine_used = NULL;
1122 while (p_routing_eng) {
1123 failed = ucast_mgr_route(p_routing_eng, p_osm);
1126 p_routing_eng = p_routing_eng->next;
1129 if (!p_osm->routing_engine_used &&
1130 p_osm->no_fallback_routing_engine != TRUE) {
1131 /* If configured routing algorithm failed, use default MinHop */
1132 failed = ucast_mgr_route(p_osm->default_routing_engine, p_osm);
1135 if (p_osm->routing_engine_used) {
1136 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
1137 "%s tables configured on all switches\n",
1138 osm_routing_engine_type_str(p_osm->
1139 routing_engine_used->type));
1141 if (p_mgr->p_subn->opt.use_ucast_cache)
1142 p_mgr->cache_valid = TRUE;
1144 p_mgr->p_subn->subnet_initialization_error = TRUE;
1145 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1146 "No routing engine able to successfully configure "
1147 " switch tables on current fabric\n");
1150 CL_PLOCK_RELEASE(p_mgr->p_lock);
1151 OSM_LOG_EXIT(p_mgr->p_log);
/*
 * Routing-engine hook: adapts the void * context (the unicast manager) to
 * osm_ucast_mgr_build_lid_matrices() and returns its result.
 */
static int ucast_build_lid_matrices(void *context)
{
	return osm_ucast_mgr_build_lid_matrices(context);
}
/*
 * Routing-engine hook: adapts the void * context (the unicast manager) to
 * ucast_mgr_build_lfts() and returns its result.
 */
static int ucast_build_lfts(void *context)
{
	return ucast_mgr_build_lfts(context);
}
1165 int osm_ucast_minhop_setup(struct osm_routing_engine *r, osm_opensm_t * osm)
1167 r->context = &osm->sm.ucast_mgr;
1168 r->build_lid_matrices = ucast_build_lid_matrices;
1169 r->ucast_build_fwd_tables = ucast_build_lfts;
1173 static int ucast_dor_build_lfts(void *context)
1175 osm_ucast_mgr_t *mgr = context;
1179 ret = ucast_mgr_build_lfts(mgr);
1185 int osm_ucast_dor_setup(struct osm_routing_engine *r, osm_opensm_t * osm)
1187 r->context = &osm->sm.ucast_mgr;
1188 r->build_lid_matrices = ucast_build_lid_matrices;
1189 r->ucast_build_fwd_tables = ucast_dor_build_lfts;
1193 int ucast_dummy_build_lid_matrices(void *context)