]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/ofed/opensm/opensm/osm_switch.c
Merge ACPICA 20190329.
[FreeBSD/FreeBSD.git] / contrib / ofed / opensm / opensm / osm_switch.c
1 /*
2  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2009 HNR Consulting. All rights reserved.
6  *
7  * This software is available to you under a choice of one of two
8  * licenses.  You may choose to be licensed under the terms of the GNU
9  * General Public License (GPL) Version 2, available from the file
10  * COPYING in the main directory of this source tree, or the
11  * OpenIB.org BSD license below:
12  *
13  *     Redistribution and use in source and binary forms, with or
14  *     without modification, are permitted provided that the following
15  *     conditions are met:
16  *
17  *      - Redistributions of source code must retain the above
18  *        copyright notice, this list of conditions and the following
19  *        disclaimer.
20  *
21  *      - Redistributions in binary form must reproduce the above
22  *        copyright notice, this list of conditions and the following
23  *        disclaimer in the documentation and/or other materials
24  *        provided with the distribution.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33  * SOFTWARE.
34  *
35  */
36
37 /*
38  * Abstract:
39  *    Implementation of osm_switch_t.
40  * This object represents an Infiniband switch.
41  * This object is part of the opensm family of objects.
42  */
43
44 #if HAVE_CONFIG_H
45 #  include <config.h>
46 #endif                          /* HAVE_CONFIG_H */
47
48 #include <stdlib.h>
49 #include <string.h>
50 #include <complib/cl_math.h>
51 #include <iba/ib_types.h>
52 #include <opensm/osm_file_ids.h>
53 #define FILE_ID OSM_FILE_SWITCH_C
54 #include <opensm/osm_switch.h>
55
/*
 * One candidate egress port collected by osm_switch_recommend_path() while
 * it scans the least-hop ports of a switch.  The array of these records is
 * re-walked by the optional port-shifting pass of that function.
 */
56 struct switch_port_path {
57         uint8_t port_num;	/* candidate port number on the switch */
58         uint32_t path_count;	/* value of osm_port_prof_path_count_get() for that port */
59         int found_sys_guid;	/* written and read only when routing_for_lmc is set */
60         int found_node_guid;	/* written and read only when routing_for_lmc is set */
61         uint32_t forwarded_to;	/* p_remote_guid->forwarded_to when LMC routing found an entry, else 0 */
62 };
63
64 cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
65                                 IN uint8_t port_num, IN uint8_t num_hops)
66 {
67         if (!lid_ho || lid_ho > p_sw->max_lid_ho)
68                 return -1;
69         if (port_num >= p_sw->num_ports)
70                 return -1;
71         if (!p_sw->hops[lid_ho]) {
72                 p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
73                 if (!p_sw->hops[lid_ho])
74                         return -1;
75                 memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
76         }
77
78         p_sw->hops[lid_ho][port_num] = num_hops;
79         if (p_sw->hops[lid_ho][0] > num_hops)
80                 p_sw->hops[lid_ho][0] = num_hops;
81
82         return 0;
83 }
84
85 void osm_switch_delete(IN OUT osm_switch_t ** pp_sw)
86 {
87         osm_switch_t *p_sw = *pp_sw;
88         unsigned i;
89
90         osm_mcast_tbl_destroy(&p_sw->mcast_tbl);
91         if (p_sw->p_prof)
92                 free(p_sw->p_prof);
93         if (p_sw->search_ordering_ports)
94                 free(p_sw->search_ordering_ports);
95         if (p_sw->lft)
96                 free(p_sw->lft);
97         if (p_sw->new_lft)
98                 free(p_sw->new_lft);
99         if (p_sw->hops) {
100                 for (i = 0; i < p_sw->num_hops; i++)
101                         if (p_sw->hops[i])
102                                 free(p_sw->hops[i]);
103                 free(p_sw->hops);
104         }
105         free(*pp_sw);
106         *pp_sw = NULL;
107 }
108
109 osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
110                              IN const osm_madw_t * p_madw)
111 {
112         osm_switch_t *p_sw;
113         ib_switch_info_t *p_si;
114         ib_smp_t *p_smp;
115         uint8_t num_ports;
116         uint32_t port_num;
117
118         CL_ASSERT(p_madw);
119         CL_ASSERT(p_node);
120
121         p_smp = osm_madw_get_smp_ptr(p_madw);
122         p_si = ib_smp_get_payload_ptr(p_smp);
123         num_ports = osm_node_get_num_physp(p_node);
124
125         CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
126
127         if (!p_si->lin_cap) /* The switch doesn't support LFT */
128                 return NULL;
129
130         p_sw = malloc(sizeof(*p_sw));
131         if (!p_sw)
132                 return NULL;
133
134         memset(p_sw, 0, sizeof(*p_sw));
135
136         p_sw->p_node = p_node;
137         p_sw->switch_info = *p_si;
138         p_sw->num_ports = num_ports;
139         p_sw->need_update = 2;
140
141         p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
142         if (!p_sw->p_prof)
143                 goto err;
144
145         memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
146
147         osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
148                            cl_ntoh16(p_si->mcast_cap));
149
150         for (port_num = 0; port_num < num_ports; port_num++)
151                 osm_port_prof_construct(&p_sw->p_prof[port_num]);
152
153         return p_sw;
154
155 err:
156         osm_switch_delete(&p_sw);
157         return NULL;
158 }
159
160 boolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw,
161                                    IN uint16_t block_id, OUT uint8_t * p_block)
162 {
163         uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
164
165         CL_ASSERT(p_sw);
166         CL_ASSERT(p_block);
167
168         if (base_lid_ho > p_sw->max_lid_ho)
169                 return FALSE;
170
171         CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE - 1 <= IB_LID_UCAST_END_HO);
172         memcpy(p_block, &(p_sw->new_lft[base_lid_ho]), IB_SMP_DATA_SIZE);
173         return TRUE;
174 }
175
176 static struct osm_remote_node *
177 switch_find_guid_common(IN const osm_switch_t * p_sw,
178                         IN struct osm_remote_guids_count *r,
179                         IN uint8_t port_num, IN int find_sys_guid,
180                         IN int find_node_guid)
181 {
182         struct osm_remote_node *p_remote_guid = NULL;
183         osm_physp_t *p_physp;
184         osm_physp_t *p_rem_physp;
185         osm_node_t *p_rem_node;
186         uint64_t sys_guid;
187         uint64_t node_guid;
188         unsigned int i;
189
190         CL_ASSERT(p_sw);
191
192         if (!r)
193                 goto out;
194
195         p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
196         if (!p_physp)
197                 goto out;
198
199         p_rem_physp = osm_physp_get_remote(p_physp);
200         p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
201         sys_guid = p_rem_node->node_info.sys_guid;
202         node_guid = p_rem_node->node_info.node_guid;
203
204         for (i = 0; i < r->count; i++) {
205                 if ((!find_sys_guid
206                      || r->guids[i].node->node_info.sys_guid == sys_guid)
207                     && (!find_node_guid
208                         || r->guids[i].node->node_info.node_guid == node_guid)) {
209                         p_remote_guid = &r->guids[i];
210                         break;
211                 }
212         }
213
214 out:
215         return p_remote_guid;
216 }
217
218 static struct osm_remote_node *
219 switch_find_sys_guid_count(IN const osm_switch_t * p_sw,
220                            IN struct osm_remote_guids_count *r,
221                            IN uint8_t port_num)
222 {
223         return switch_find_guid_common(p_sw, r, port_num, 1, 0);
224 }
225
226 static struct osm_remote_node *
227 switch_find_node_guid_count(IN const osm_switch_t * p_sw,
228                             IN struct osm_remote_guids_count *r,
229                             IN uint8_t port_num)
230 {
231         return switch_find_guid_common(p_sw, r, port_num, 0, 1);
232 }
233
/*
 * Recommend the egress port of switch p_sw for traffic towards p_port.
 *
 * p_sw            - switch whose forwarding entry is being chosen.
 * p_port          - destination port; when routing_for_lmc is set its priv
 *                   pointer is read as a struct osm_remote_guids_count.
 * lid_ho          - destination LID in host order; only used to look up an
 *                   existing forwarding entry.
 * start_from      - offset at which the scan over the ports begins (the scan
 *                   index is taken modulo num_ports).
 * ignore_existing - when FALSE an existing forwarding entry is reused if its
 *                   port is healthy, linked and on a least-hop path.
 * routing_for_lmc - enables tracking of already used remote systems/nodes.
 * dor             - restricts candidates to ports that lead to the same
 *                   remote node as the first candidate found.
 * port_shifting   - re-runs the selection over the collected candidates in a
 *                   rotated order.
 * scatter_ports   - when non-zero the result is drawn with random() from the
 *                   ports tied at the minimum path count.
 * lft_enum        - passed through to osm_switch_get_port_by_lid().
 *
 * Returns 0 when p_port belongs to p_sw itself, the attached port number when
 * p_port hangs directly off p_sw, OSM_NO_PATH when the destination is not
 * reachable or no usable port was found, otherwise the selected port number.
 */
234 uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
235                                   IN osm_port_t * p_port, IN uint16_t lid_ho,
236                                   IN unsigned start_from,
237                                   IN boolean_t ignore_existing,
238                                   IN boolean_t routing_for_lmc,
239                                   IN boolean_t dor,
240                                   IN boolean_t port_shifting,
241                                   IN uint32_t scatter_ports,
242                                   IN osm_lft_type_enum lft_enum)
243 {
244         /*
245            We support an enhanced LMC aware routing mode:
246            In the case of LMC > 0, we can track the remote side
247            system and node for all of the lids of the target
248            and try and avoid routing again through the same
249            system / node.
250
251            Assume if routing_for_lmc is true that this procedure was
252            provided the tracking array and counter via p_port->priv,
253            and we can conduct this algorithm.
254          */
255         uint16_t base_lid;
256         uint8_t hops;
257         uint8_t least_hops;
258         uint8_t port_num;
259         uint8_t num_ports;
260         uint32_t least_paths = 0xFFFFFFFF;
261         unsigned i;
262         /*
263            The following will track the least paths if the
264            route should go through a new system/node
265          */
266         uint32_t least_paths_other_sys = 0xFFFFFFFF;
267         uint32_t least_paths_other_nodes = 0xFFFFFFFF;
268         uint32_t least_forwarded_to = 0xFFFFFFFF;
269         uint32_t check_count;
270         uint8_t best_port = 0;
271         /*
272            These vars track the best port if it connects to
273            not used system/node.
274          */
275         uint8_t best_port_other_sys = 0;
276         uint8_t best_port_other_node = 0;
277         boolean_t port_found = FALSE;
278         osm_physp_t *p_physp;
279         osm_physp_t *p_rem_physp;
280         osm_node_t *p_rem_node;
281         osm_node_t *p_rem_node_first = NULL;
282         struct osm_remote_node *p_remote_guid = NULL;
283         struct osm_remote_node null_remote_node = {NULL, 0, 0};
284         struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
285         unsigned int port_paths_total_paths = 0;
286         unsigned int port_paths_count = 0;
287         uint8_t scatter_possible_ports[IB_NODE_NUM_PORTS_MAX];
288         unsigned int scatter_possible_ports_count = 0;
289         int found_sys_guid = 0;
290         int found_node_guid = 0;
291
292         CL_ASSERT(lid_ho > 0);
293
        /*
         * Pick the LID used for all hop-count lookups: the port's own base
         * LID when it sits on a switch, otherwise the base LID of the switch
         * the port is attached to.
         */
294         if (p_port->p_node->sw) {
295                 if (p_port->p_node->sw == p_sw)
296                         return 0;
297                 base_lid = osm_port_get_base_lid(p_port);
298         } else {
299                 p_physp = p_port->p_physp;
300                 if (!p_physp || !p_physp->p_remote_physp ||
301                     !p_physp->p_remote_physp->p_node->sw)
302                         return OSM_NO_PATH;
303
304                 if (p_physp->p_remote_physp->p_node->sw == p_sw)
305                         return p_physp->p_remote_physp->port_num;
306                 base_lid =
307                     osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
308         }
309         base_lid = cl_ntoh16(base_lid);
310
311         num_ports = p_sw->num_ports;
312
313         least_hops = osm_switch_get_least_hops(p_sw, base_lid);
314         if (least_hops == OSM_NO_PATH)
315                 return OSM_NO_PATH;
316
317         /*
318            First, inquire with the forwarding table for an existing
319            route.  If one is found, honor it unless:
320            1. the ignore existing flag is set.
321            2. the physical port is not a valid one or not healthy
322            3. the physical port has a remote port (the link is up)
323            4. the port has min-hops to the target (avoid loops)
324          */
325         if (!ignore_existing) {
326                 port_num = osm_switch_get_port_by_lid(p_sw, lid_ho, lft_enum);
327
328                 if (port_num != OSM_NO_PATH) {
329                         CL_ASSERT(port_num < num_ports);
330
331                         p_physp =
332                             osm_node_get_physp_ptr(p_sw->p_node, port_num);
333                         /*
334                            Don't be too trusting of the current forwarding table!
335                            Verify that the port number is legal and that the
336                            LID is reachable through this port.
337                          */
338                         if (p_physp && osm_physp_is_healthy(p_physp) &&
339                             osm_physp_get_remote(p_physp)) {
340                                 hops =
341                                     osm_switch_get_hop_count(p_sw, base_lid,
342                                                              port_num);
343                                 /*
344                                    If we aren't using pre-defined user routes
345                                    function, then we need to make sure that the
346                                    current path is the minimum one. In case of
347                                    having such a user function - this check will
348                                    not be done, and the old routing will be used.
349                                    Note: This means that it is the user's job to
350                                    clean all data in the forwarding tables that
351                                    he wants to be overridden by the minimum
352                                    hop function.
353                                  */
354                                 if (hops == least_hops)
355                                         return port_num;
356                         }
357                 }
358         }
359
360         /*
361            This algorithm selects a port based on a static load balanced
362            selection across equal hop-count ports.
363            There is lots of room for improved sophistication here,
364            possibly guided by user configuration info.
365          */
366
367         /*
368            OpenSM routing is "local" - not considering a full lid to lid
369            path. As such we can not guarantee a path will not loop if we
370            do not always follow least hops.
371            So we must abort if not least hops.
372          */
373
374         /* port number starts with one and num_ports is 1 + num phys ports */
375         for (i = start_from; i < start_from + num_ports; i++) {
376                 port_num = osm_switch_get_dimn_port(p_sw, i % num_ports);
377                 if (!port_num ||
378                     osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
379                     least_hops)
380                         continue;
381
382                 /* let us make sure it is not down or unhealthy */
383                 p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
384                 if (!p_physp || !osm_physp_is_healthy(p_physp) ||
385                     /*
386                        we require all - non sma ports to be linked
387                        to be routed through
388                      */
389                     !osm_physp_get_remote(p_physp))
390                         continue;
391
392                 /*
393                    We located a least-hop port, possibly one of many.
394                    For this port, check the running total count of
395                    the number of paths through this port.  Select
396                    the port routing the least number of paths.
397                  */
398                 check_count =
399                     osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
400
401
402                 if (dor) {
403                         /* Get the Remote Node */
404                         p_rem_physp = osm_physp_get_remote(p_physp);
405                         p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
406                         /* use the first dimension, but spread traffic
407                          * out among the group of ports representing
408                          * that dimension */
409                         if (!p_rem_node_first)
410                                 p_rem_node_first = p_rem_node;
411                         else if (p_rem_node != p_rem_node_first)
412                                 continue;
413                         if (routing_for_lmc) {
                                /*
                                 * NOTE(review): r is dereferenced here without
                                 * a NULL check, whereas the non-DOR branch goes
                                 * through switch_find_guid_common(), which
                                 * tolerates a NULL array - confirm p_port->priv
                                 * is always set when routing_for_lmc is TRUE.
                                 * Also note that found_sys_guid and
                                 * found_node_guid are not updated in this
                                 * branch.
                                 */
414                                 struct osm_remote_guids_count *r = p_port->priv;
415                                 uint8_t rem_port = osm_physp_get_port_num(p_rem_physp);
416                                 unsigned int j;
417
418                                 for (j = 0; j < r->count; j++) {
419                                         p_remote_guid = &r->guids[j];
420                                         if ((p_remote_guid->node == p_rem_node)
421                                             && (p_remote_guid->port == rem_port))
422                                                 break;
423                                 }
                                /* no entry matched: fall back to the all-zero placeholder */
424                                 if (j == r->count)
425                                         p_remote_guid = &null_remote_node;
426                         }
427                 /*
428                    Advanced LMC routing requires tracking of the
429                    best port by the node connected to the other side of
430                    it.
431                  */
432                 } else if (routing_for_lmc) {
433                         /* Is the sys guid already used ? */
434                         p_remote_guid = switch_find_sys_guid_count(p_sw,
435                                                                    p_port->priv,
436                                                                    port_num);
437
438                         /* If not update the least hops for this case */
439                         if (!p_remote_guid) {
440                                 if (check_count < least_paths_other_sys) {
441                                         least_paths_other_sys = check_count;
442                                         best_port_other_sys = port_num;
443                                         least_forwarded_to = 0;
444                                 }
445                                 found_sys_guid = 0;
446                         } else {        /* same sys found - try node */
447
448
449                                 /* Else is the node guid already used ? */
450                                 p_remote_guid = switch_find_node_guid_count(p_sw,
451                                                                             p_port->priv,
452                                                                             port_num);
453
454                                 /* If not update the least hops for this case */
455                                 if (!p_remote_guid
456                                     && check_count < least_paths_other_nodes) {
457                                         least_paths_other_nodes = check_count;
458                                         best_port_other_node = port_num;
459                                         least_forwarded_to = 0;
460                                 }
461                                 /* else prior sys and node guid already used */
462
463                                 if (!p_remote_guid)
464                                         found_node_guid = 0;
465                                 else
466                                         found_node_guid = 1;
467                                 found_sys_guid = 1;
468                         }       /* same sys found */
469                 }
470
                /* remember this candidate for the optional port-shifting pass below */
471                 port_paths[port_paths_count].port_num = port_num;
472                 port_paths[port_paths_count].path_count = check_count;
473                 if (routing_for_lmc) {
474                         port_paths[port_paths_count].found_sys_guid = found_sys_guid;
475                         port_paths[port_paths_count].found_node_guid = found_node_guid;
476                 }
477                 if (routing_for_lmc && p_remote_guid)
478                         port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to;
479                 else
480                         port_paths[port_paths_count].forwarded_to = 0;
481                 port_paths_total_paths += check_count;
482                 port_paths_count++;
483
484                 /* routing for LMC mode */
485                 /*
486                    the count is min but also lower than the max subscribed
487                  */
488                 if (check_count < least_paths) {
489                         port_found = TRUE;
490                         best_port = port_num;
491                         least_paths = check_count;
                        /* a new minimum restarts the list of ports tied at that minimum */
492                         scatter_possible_ports_count = 0;
493                         scatter_possible_ports[scatter_possible_ports_count++] = port_num;
494                         if (routing_for_lmc
495                             && p_remote_guid
496                             && p_remote_guid->forwarded_to < least_forwarded_to)
497                                 least_forwarded_to = p_remote_guid->forwarded_to;
498                 } else if (scatter_ports
499                            && check_count == least_paths) {
500                         scatter_possible_ports[scatter_possible_ports_count++] = port_num;
501                 } else if (routing_for_lmc
502                            && p_remote_guid
503                            && check_count == least_paths
504                            && p_remote_guid->forwarded_to < least_forwarded_to) {
505                         least_forwarded_to = p_remote_guid->forwarded_to;
506                         best_port = port_num;
507                 }
508         }
509
510         if (port_found == FALSE)
511                 return OSM_NO_PATH;
512
513         if (port_shifting && port_paths_count) {
514                 /* In the port_paths[] array, we now have all the ports that we
515                  * can route out of.  Using some shifting math below, possibly
516                  * select a different one so that lids won't align in LFTs
517                  *
518                  * If lmc > 0, we need to loop through these ports to find the
519                  * least_forwarded_to port, best_port_other_sys, and
520                  * best_port_other_node just like before but through the different
521                  * ordering.
522                  */
523
524                 least_paths = 0xFFFFFFFF;
525                 least_paths_other_sys = 0xFFFFFFFF;
526                 least_paths_other_nodes = 0xFFFFFFFF;
527                 least_forwarded_to = 0xFFFFFFFF;
528                 best_port = 0;
529                 best_port_other_sys = 0;
530                 best_port_other_node = 0;
531
532                 for (i = 0; i < port_paths_count; i++) {
533                         unsigned int idx;
534
                        /* rotate the visiting order by the average path count per candidate */
535                         idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count;
536
537                         if (routing_for_lmc) {
538                                 if (!port_paths[idx].found_sys_guid
539                                     && port_paths[idx].path_count < least_paths_other_sys) {
540                                         least_paths_other_sys = port_paths[idx].path_count;
541                                         best_port_other_sys = port_paths[idx].port_num;
542                                         least_forwarded_to = 0;
543                                 }
544                                 else if (!port_paths[idx].found_node_guid
545                                          && port_paths[idx].path_count < least_paths_other_nodes) {
546                                         least_paths_other_nodes = port_paths[idx].path_count;
547                                         best_port_other_node = port_paths[idx].port_num;
548                                         least_forwarded_to = 0;
549                                 }
550                         }
551
552                         if (port_paths[idx].path_count < least_paths) {
553                                 best_port = port_paths[idx].port_num;
554                                 least_paths = port_paths[idx].path_count;
555                                 if (routing_for_lmc
556                                     && (port_paths[idx].found_sys_guid
557                                         || port_paths[idx].found_node_guid)
558                                     && port_paths[idx].forwarded_to < least_forwarded_to)
559                                         least_forwarded_to = port_paths[idx].forwarded_to;
560                         }
561                         else if (routing_for_lmc
562                                  && (port_paths[idx].found_sys_guid
563                                      || port_paths[idx].found_node_guid)
564                                  && port_paths[idx].path_count == least_paths
565                                  && port_paths[idx].forwarded_to < least_forwarded_to) {
566                                 least_forwarded_to = port_paths[idx].forwarded_to;
567                                 best_port = port_paths[idx].port_num;
568                         }
569
570                 }
571         }
572
573         /*
574            if we are in enhanced routing mode and the best port is not
575            the local port 0
576          */
577         if (routing_for_lmc && best_port && !scatter_ports) {
578                 /* Select the least hop port of the non used sys first */
579                 if (best_port_other_sys)
580                         best_port = best_port_other_sys;
581                 else if (best_port_other_node)
582                         best_port = best_port_other_node;
583         } else if (scatter_ports) {
584                 /*
585                  * There is some danger that this random could "rebalance" the routes
586                  * every time, to combat this there is a global srandom that
587                  * occurs at the start of every sweep.
588                  */
                /*
                 * scatter_possible_ports_count is at least 1 here: port_found
                 * is only set in the branch that also stores the first entry.
                 */
589                 unsigned int idx = random() % scatter_possible_ports_count;
590                 best_port = scatter_possible_ports[idx];
591         }
592         return best_port;
593 }
594
595 void osm_switch_clear_hops(IN osm_switch_t * p_sw)
596 {
597         unsigned i;
598
599         for (i = 0; i < p_sw->num_hops; i++)
600                 if (p_sw->hops[i])
601                         memset(p_sw->hops[i], OSM_NO_PATH, p_sw->num_ports);
602 }
603
604 static int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids)
605 {
606         uint16_t lft_size;
607
608         /* Ensure LFT is in units of LFT block size */
609         lft_size = (lids / IB_SMP_DATA_SIZE + 1) * IB_SMP_DATA_SIZE;
610         if (lft_size > p_sw->lft_size) {
611                 uint8_t *new_lft = realloc(p_sw->lft, lft_size);
612                 if (!new_lft)
613                         return -1;
614                 memset(new_lft + p_sw->lft_size, OSM_NO_PATH,
615                        lft_size - p_sw->lft_size);
616                 p_sw->lft = new_lft;
617                 p_sw->lft_size = lft_size;
618         }
619
620         return 0;
621 }
622
623 int osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
624 {
625         uint8_t **hops;
626         uint8_t *new_lft;
627         unsigned i;
628
629         if (alloc_lft(p_sw, max_lids))
630                 return -1;
631
632         for (i = 0; i < p_sw->num_ports; i++)
633                 osm_port_prof_construct(&p_sw->p_prof[i]);
634
635         osm_switch_clear_hops(p_sw);
636
637         if (!(new_lft = realloc(p_sw->new_lft, p_sw->lft_size)))
638                 return -1;
639
640         p_sw->new_lft = new_lft;
641
642         memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
643
644         if (!p_sw->hops) {
645                 hops = malloc((max_lids + 1) * sizeof(hops[0]));
646                 if (!hops)
647                         return -1;
648                 memset(hops, 0, (max_lids + 1) * sizeof(hops[0]));
649                 p_sw->hops = hops;
650                 p_sw->num_hops = max_lids + 1;
651         } else if (max_lids + 1 > p_sw->num_hops) {
652                 hops = realloc(p_sw->hops, (max_lids + 1) * sizeof(hops[0]));
653                 if (!hops)
654                         return -1;
655                 memset(hops + p_sw->num_hops, 0,
656                        (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0]));
657                 p_sw->hops = hops;
658                 p_sw->num_hops = max_lids + 1;
659         }
660         p_sw->max_lid_ho = max_lids;
661
662         return 0;
663 }
664
665 uint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw,
666                                        IN const osm_port_t * p_port)
667 {
668         uint16_t lid;
669
670         if (p_port->p_node->sw) {
671                 if (p_port->p_node->sw == p_sw)
672                         return 0;
673                 lid = osm_node_get_base_lid(p_port->p_node, 0);
674                 return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
675         } else {
676                 osm_physp_t *p = p_port->p_physp;
677                 uint8_t hops;
678
679                 if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw)
680                         return OSM_NO_PATH;
681                 if (p->p_remote_physp->p_node->sw == p_sw)
682                         return 1;
683                 lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0);
684                 hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
685                 return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH;
686         }
687 }
688
689 uint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw,
690                                         IN osm_port_t * p_port,
691                                         IN uint16_t mlid_ho,
692                                         IN boolean_t ignore_existing)
693 {
694         uint16_t base_lid;
695         uint8_t hops;
696         uint8_t port_num;
697         uint8_t num_ports;
698         uint8_t least_hops;
699
700         CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
701
702         if (p_port->p_node->sw) {
703                 if (p_port->p_node->sw == p_sw)
704                         return 0;
705                 base_lid = osm_port_get_base_lid(p_port);
706         } else {
707                 osm_physp_t *p_physp = p_port->p_physp;
708                 if (!p_physp || !p_physp->p_remote_physp ||
709                     !p_physp->p_remote_physp->p_node->sw)
710                         return OSM_NO_PATH;
711                 if (p_physp->p_remote_physp->p_node->sw == p_sw)
712                         return p_physp->p_remote_physp->port_num;
713                 base_lid =
714                     osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
715         }
716         base_lid = cl_ntoh16(base_lid);
717         num_ports = p_sw->num_ports;
718
719         /*
720            If the user wants us to ignore existing multicast routes,
721            then simply return the shortest hop count path to the
722            target port.
723
724            Otherwise, return the first port that has a path to the target,
725            picking from the ports that are already in the multicast group.
726          */
727         if (!ignore_existing) {
728                 for (port_num = 1; port_num < num_ports; port_num++) {
729                         if (!osm_mcast_tbl_is_port
730                             (&p_sw->mcast_tbl, mlid_ho, port_num))
731                                 continue;
732                         /*
733                            Don't be too trusting of the current forwarding table!
734                            Verify that the LID is reachable through this port.
735                          */
736                         hops =
737                             osm_switch_get_hop_count(p_sw, base_lid, port_num);
738                         if (hops != OSM_NO_PATH)
739                                 return port_num;
740                 }
741         }
742
743         /*
744            Either no existing mcast paths reach this port or we are
745            ignoring existing paths.
746
747            Determine the best multicast path to the target.  Note that this
748            algorithm is slightly different from the one used for unicast route
749            recommendation.  In this case (multicast), we must NOT
750            perform any sort of load balancing.  We MUST take the FIRST
751            port found that has <= the lowest hop count path.  This prevents
752            more than one multicast path to the same remote switch which
753            prevents a multicast loop.  Multicast loops are bad since the same
754            multicast packet will go around and around, inevitably creating
755            a black hole that will destroy the Earth in a firey conflagration.
756          */
757         least_hops = osm_switch_get_least_hops(p_sw, base_lid);
758         if (least_hops == OSM_NO_PATH)
759                 return OSM_NO_PATH;
760         for (port_num = 1; port_num < num_ports; port_num++)
761                 if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
762                     least_hops)
763                         break;
764
765         CL_ASSERT(port_num < num_ports);
766         return port_num;
767 }