2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2008 Xsigo Systems Inc. All rights reserved.
6 * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved.
8 * This software is available to you under a choice of one of two
9 * licenses. You may choose to be licensed under the terms of the GNU
10 * General Public License (GPL) Version 2, available from the file
11 * COPYING in the main directory of this source tree, or the
12 * OpenIB.org BSD license below:
14 * Redistribution and use in source and binary forms, with or
15 * without modification, are permitted provided that the following
18 * - Redistributions of source code must retain the above
19 * copyright notice, this list of conditions and the following
22 * - Redistributions in binary form must reproduce the above
23 * copyright notice, this list of conditions and the following
24 * disclaimer in the documentation and/or other materials
25 * provided with the distribution.
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40 * Implementation of osm_drop_mgr_t.
41 * This object represents the Drop Manager object.
42 * This object is part of the opensm family of objects.
47 #endif /* HAVE_CONFIG_H */
51 #include <iba/ib_types.h>
52 #include <complib/cl_qmap.h>
53 #include <complib/cl_passivelock.h>
54 #include <complib/cl_debug.h>
55 #include <complib/cl_ptr_vector.h>
56 #include <opensm/osm_file_ids.h>
57 #define FILE_ID OSM_FILE_DROP_MGR_C
58 #include <opensm/osm_sm.h>
59 #include <opensm/osm_router.h>
60 #include <opensm/osm_switch.h>
61 #include <opensm/osm_node.h>
62 #include <opensm/osm_guid.h>
63 #include <opensm/osm_helper.h>
64 #include <opensm/osm_multicast.h>
65 #include <opensm/osm_remote_sm.h>
66 #include <opensm/osm_inform.h>
67 #include <opensm/osm_ucast_mgr.h>
/*
 * drop_mgr_remove_router
 *
 * Drops the router object keyed by portguid from the subnet's router GUID
 * table (sm->p_subn->rtr_guid_tbl).  If cl_qmap_remove() hands back an item
 * other than the table's end item, the removal is logged at VERBOSE level
 * and the object is passed to osm_router_delete().
 *
 * sm       - SM object; supplies the subnet (p_subn) and the log (p_log).
 * portguid - port GUID used as the key into the router table.
 *
 * NOTE(review): p_rtr is used below but its declaration (and the function's
 * braces) are not among the visible lines - confirm against the full file.
 */
69 static void drop_mgr_remove_router(osm_sm_t * sm, IN const ib_net64_t portguid)
72 cl_qmap_t *p_rtr_guid_tbl;
74 p_rtr_guid_tbl = &sm->p_subn->rtr_guid_tbl;
/* Remove by key first; the result is then compared with the table's end item. */
75 p_rtr = (osm_router_t *) cl_qmap_remove(p_rtr_guid_tbl, portguid);
/* Only log and delete when the removed item is not the end item. */
76 if (p_rtr != (osm_router_t *) cl_qmap_end(p_rtr_guid_tbl)) {
77 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
78 "Cleaned router for port guid 0x%016" PRIx64 "\n",
80 osm_router_delete(&p_rtr);
/*
 * drop_mgr_clean_physp
 *
 * Tears down one physical port (p_physp) that is being dropped.
 *
 * Steps visible in this function:
 *  - fetch the remote physical port with osm_physp_get_remote() and look up
 *    the port object that owns the remote port GUID;
 *  - when the remote port has a non-zero discovery_count and its port state
 *    matches the compared value (the log text says ACTIVE), set
 *    sm->p_subn->force_heavy_sweep to TRUE;
 *  - when the remote node has no switch object (p_node->sw is NULL) and the
 *    remote port is not the SM's own port, reset the remote port's
 *    discovery_count to 0;
 *  - log the unlink, record the link in the unicast cache when
 *    sm->ucast_mgr.cache_valid is set, and call osm_physp_unlink();
 *  - clear the port's physp_discovered flag, log, and call
 *    osm_physp_destroy() on p_physp.
 *
 * sm      - SM object; supplies the subnet, log and unicast manager.
 * p_physp - physical port to clean up.
 *
 * NOTE(review): the guards around the remote-port handling (NULL checks,
 * braces) are not among the visible lines - confirm nesting in the full file.
 */
84 static void drop_mgr_clean_physp(osm_sm_t * sm, IN osm_physp_t * p_physp)
86 osm_physp_t *p_remote_physp;
87 osm_port_t *p_remote_port;
/* The far end of the link, if any, drives the remote-side bookkeeping below. */
89 p_remote_physp = osm_physp_get_remote(p_physp);
91 p_remote_port = osm_get_port_by_guid(sm->p_subn,
92 p_remote_physp->port_guid);
95 /* Let's check if this is a case of link that is lost
96 (both ports weren't recognized), or a "hiccup" in the
97 subnet - in which case the remote port was
98 recognized, and its state is ACTIVE.
99 If this is just a "hiccup" - force a heavy sweep in
100 the next sweep. We don't want to lose that part of
102 if (p_remote_port->discovery_count &&
103 osm_physp_get_port_state(p_remote_physp) ==
105 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
106 "Forcing new heavy sweep. Remote "
107 "port 0x%016" PRIx64 " port num: %u "
108 "was recognized in ACTIVE state\n",
109 cl_ntoh64(p_remote_physp->port_guid),
110 p_remote_physp->port_num);
111 sm->p_subn->force_heavy_sweep = TRUE;
114 /* If the remote node is ca or router - need to remove
115 the remote port, since it is no longer reachable.
116 This can be done if we reset the discovery count
117 of the remote port. */
118 if (!p_remote_physp->p_node->sw &&
119 p_remote_physp->port_guid != sm->p_subn->sm_port_guid) {
120 p_remote_port->discovery_count = 0;
121 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
122 "Resetting discovery count of node: "
123 "0x%016" PRIx64 " port num:%u\n",
124 cl_ntoh64(osm_node_get_node_guid
125 (p_remote_physp->p_node)),
126 p_remote_physp->port_num);
/* Log both ends of the link before it is unlinked. */
130 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
131 "Unlinking local node 0x%016" PRIx64 ", port %u"
132 "\n\t\t\t\tand remote node 0x%016" PRIx64
134 cl_ntoh64(osm_node_get_node_guid(p_physp->p_node)),
136 cl_ntoh64(osm_node_get_node_guid
137 (p_remote_physp->p_node)),
138 p_remote_physp->port_num);
/* The link is handed to the unicast cache only while that cache is valid. */
140 if (sm->ucast_mgr.cache_valid)
141 osm_ucast_cache_add_link(&sm->ucast_mgr, p_physp,
144 osm_physp_unlink(p_physp, p_remote_physp);
148 /* Make port as undiscovered */
149 p_physp->p_node->physp_discovered[p_physp->port_num] = 0;
151 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
152 "Clearing node 0x%016" PRIx64 " physical port number %u\n",
153 cl_ntoh64(osm_node_get_node_guid(p_physp->p_node)),
156 osm_physp_destroy(p_physp);
/*
 * drop_mgr_remove_port
 *
 * Removes an unreachable port (p_port) from the subnet object.
 *
 * Steps visible in this function:
 *  - verify that the port GUID table entry for this GUID is p_port
 *    (ERR 0101 is logged otherwise);
 *  - build and report trap 65 (SM_GID_OUT_OF_SERVICE_TRAP) for the port's
 *    GID; ERR 0103 is logged when osm_report_notice() fails;
 *  - delete queued GUIDInfo work objects that reference p_port;
 *  - remove the port from every multicast group on its mcm_list;
 *  - remove alias GUID entries whose base port is p_port;
 *  - remove the GUID from the port and remote-SM tables and drop any router
 *    object for it;
 *  - clear the port's LID range in the port LID table;
 *  - clean the port's physical port, optionally remove its event
 *    subscriptions, delete the port object and log the removal.
 *
 * sm     - SM object; supplies the subnet and the log.
 * p_port - port to remove; it is passed to osm_port_delete() near the end.
 *
 * NOTE(review): several declarations (port_gid, p_node, lid variables),
 * braces and some argument lines are not among the visible lines - confirm
 * against the full file.
 */
159 static void drop_mgr_remove_port(osm_sm_t * sm, IN osm_port_t * p_port)
161 ib_net64_t port_guid;
162 osm_port_t *p_port_check;
163 cl_qmap_t *p_alias_guid_tbl;
164 cl_qmap_t *p_sm_guid_tbl;
165 osm_mcm_port_t *mcm_port;
166 cl_ptr_vector_t *p_port_lid_tbl;
171 osm_remote_sm_t *p_sm;
172 osm_alias_guid_t *p_alias_guid, *p_alias_guid_check;
173 osm_guidinfo_work_obj_t *wobj;
174 cl_list_item_t *item, *next_item;
176 ib_mad_notice_attr_t notice;
177 ib_api_status_t status;
179 OSM_LOG_ENTER(sm->p_log);
181 port_guid = osm_port_get_guid(p_port);
182 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
183 "Unreachable port 0x%016" PRIx64 "\n", cl_ntoh64(port_guid));
186 (osm_port_t *) cl_qmap_get(&sm->p_subn->port_guid_tbl, port_guid);
/* The table entry for this GUID must be the very port being removed. */
187 if (p_port_check != p_port) {
188 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0101: "
189 "Port 0x%016" PRIx64 " not in guid table\n",
190 cl_ntoh64(port_guid));
194 /* issue a notice - trap 65 (SM_GID_OUT_OF_SERVICE_TRAP) */
195 /* details of the notice */
196 notice.generic_type = 0x80 | IB_NOTICE_TYPE_SUBN_MGMT; /* is generic subn mgt type */
197 ib_notice_set_prod_type_ho(&notice, 4); /* A class manager generator */
198 /* endport ceases to be reachable */
199 notice.g_or_v.generic.trap_num = CL_HTON16(SM_GID_OUT_OF_SERVICE_TRAP); /* 65 */
200 /* The sm_base_lid is saved in network order already. */
201 notice.issuer_lid = sm->p_subn->sm_base_lid;
202 /* following C14-72.1.2 and table 119 p725 */
203 /* we need to provide the GID */
204 port_gid.unicast.prefix = sm->p_subn->opt.subnet_prefix;
205 port_gid.unicast.interface_id = port_guid;
206 memcpy(&(notice.data_details.ntc_64_67.gid),
207 &(port_gid), sizeof(ib_gid_t));
209 /* According to page 653 - the issuer gid in this case of trap
210 is the SM gid, since the SM is the initiator of this trap. */
211 notice.issuer_gid.unicast.prefix = sm->p_subn->opt.subnet_prefix;
212 notice.issuer_gid.unicast.interface_id = sm->p_subn->sm_port_guid;
214 status = osm_report_notice(sm->p_log, sm->p_subn, &notice);
215 if (status != IB_SUCCESS) {
216 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0103: "
217 "Error sending trap reports (%s)\n",
218 ib_get_err_str(status));
/*
 * Walk the alias GUID work list.  The successor is fetched before the
 * current item is examined because the current item may be removed.
 */
221 next_item = cl_qlist_head(&sm->p_subn->alias_guid_list);
222 while (next_item != cl_qlist_end(&sm->p_subn->alias_guid_list)) {
224 next_item = cl_qlist_next(item);
225 wobj = cl_item_obj(item, wobj, list_item);
226 if (wobj->p_port == p_port) {
227 cl_qlist_remove_item(&sm->p_subn->alias_guid_list,
229 osm_guid_work_obj_delete(wobj);
/* Loop until the port's multicast membership list is empty. */
233 while (!cl_is_qlist_empty(&p_port->mcm_list)) {
234 mcm_port = cl_item_obj(cl_qlist_head(&p_port->mcm_list),
235 mcm_port, list_item);
236 osm_mgrp_delete_port(sm->p_subn, sm->p_log, mcm_port->mgrp,
/* Scan the alias GUID table for entries whose base port is p_port. */
240 p_alias_guid_tbl = &sm->p_subn->alias_port_guid_tbl;
241 p_alias_guid_check = (osm_alias_guid_t *) cl_qmap_head(p_alias_guid_tbl);
242 while (p_alias_guid_check != (osm_alias_guid_t *) cl_qmap_end(p_alias_guid_tbl)) {
243 if (p_alias_guid_check->p_base_port == p_port)
244 p_alias_guid = p_alias_guid_check;
247 p_alias_guid_check = (osm_alias_guid_t *) cl_qmap_next(&p_alias_guid_check->map_item);
249 cl_qmap_remove_item(p_alias_guid_tbl,
250 &p_alias_guid->map_item);
251 osm_alias_guid_delete(&p_alias_guid);
255 cl_qmap_remove(&sm->p_subn->port_guid_tbl, port_guid);
/* The same GUID may also be registered as a remote SM. */
257 p_sm_guid_tbl = &sm->p_subn->sm_guid_tbl;
258 p_sm = (osm_remote_sm_t *) cl_qmap_remove(p_sm_guid_tbl, port_guid);
259 if (p_sm != (osm_remote_sm_t *) cl_qmap_end(p_sm_guid_tbl)) {
260 /* need to remove this item */
261 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
262 "Cleaned SM for port guid 0x%016" PRIx64 "\n",
263 cl_ntoh64(port_guid));
267 drop_mgr_remove_router(sm, port_guid);
269 osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
271 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
272 "Clearing abandoned LID range [%u,%u]\n",
273 min_lid_ho, max_lid_ho);
/* Every LID in the port's range is set to NULL in the LID table. */
275 p_port_lid_tbl = &sm->p_subn->port_lid_tbl;
276 for (lid_ho = min_lid_ho; lid_ho <= max_lid_ho; lid_ho++)
277 cl_ptr_vector_set(p_port_lid_tbl, lid_ho, NULL);
279 drop_mgr_clean_physp(sm, p_port->p_physp);
281 /* Delete event forwarding subscriptions */
282 if (sm->p_subn->opt.drop_event_subscriptions) {
283 if (osm_infr_remove_subscriptions(sm->p_subn, sm->p_log, port_guid)
285 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
286 "Removed event subscriptions for port 0x%016" PRIx64 "\n",
287 cl_ntoh64(port_guid));
290 /* initialize the p_node - may need to get node_desc later */
291 p_node = p_port->p_node;
293 osm_port_delete(&p_port);
/* p_port has been handed to osm_port_delete(); the log below uses saved copies. */
295 OSM_LOG(sm->p_log, OSM_LOG_INFO,
296 "Removed port with GUID:0x%016" PRIx64
297 " LID range [%u, %u] of node:%s\n",
298 cl_ntoh64(port_gid.unicast.interface_id),
299 min_lid_ho, max_lid_ho,
300 p_node ? p_node->print_desc : "UNKNOWN");
303 OSM_LOG_EXIT(sm->p_log);
/*
 * drop_mgr_remove_switch
 *
 * Removes the switch object keyed by the node GUID of p_node from the
 * subnet's switch GUID table (sm->p_subn->sw_guid_tbl).  When
 * cl_qmap_remove() returns the table's end item, ERR 0102 is logged;
 * osm_switch_delete() is called on the removed object.
 *
 * sm     - SM object; supplies the subnet and the log.
 * p_node - node whose switch object is to be removed.
 *
 * NOTE(review): the declaration of p_sw and the braces/else that separate
 * the error path from the delete call are not among the visible lines -
 * confirm against the full file.
 */
306 static void drop_mgr_remove_switch(osm_sm_t * sm, IN osm_node_t * p_node)
309 cl_qmap_t *p_sw_guid_tbl;
310 ib_net64_t node_guid;
312 OSM_LOG_ENTER(sm->p_log);
314 node_guid = osm_node_get_node_guid(p_node);
315 p_sw_guid_tbl = &sm->p_subn->sw_guid_tbl;
/* Remove by key, then compare the result with the table's end item. */
317 p_sw = (osm_switch_t *) cl_qmap_remove(p_sw_guid_tbl, node_guid);
318 if (p_sw == (osm_switch_t *) cl_qmap_end(p_sw_guid_tbl)) {
319 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0102: "
320 "Node 0x%016" PRIx64 " not in switch table\n",
321 cl_ntoh64(osm_node_get_node_guid(p_node)));
324 osm_switch_delete(&p_sw);
327 OSM_LOG_EXIT(sm->p_log);
/*
 * drop_mgr_process_node
 *
 * Removes an unreachable node (p_node) from the subnet object.
 *
 * Steps visible in this function:
 *  - record the node in the unicast cache when sm->ucast_mgr.cache_valid;
 *  - for every physical port number of the node, look up the physp and the
 *    port object for its GUID, then call drop_mgr_remove_port() and
 *    drop_mgr_clean_physp();
 *  - call drop_mgr_remove_switch() for the node;
 *  - remove the node from sm->p_subn->node_guid_tbl (ERR 0105 is logged when
 *    the removed entry is not p_node) and call osm_node_delete().
 *
 * sm     - SM object; supplies the subnet, log and unicast manager.
 * p_node - node to drop; it is passed to osm_node_delete() at the end.
 *
 * return_val is initialised to FALSE; no other assignment to it is visible.
 *
 * NOTE(review): the declarations of p_port, port_num and max_ports, the
 * guards around the per-port calls and the switch removal, and the return
 * statement are not among the visible lines - confirm against the full file.
 */
330 static boolean_t drop_mgr_process_node(osm_sm_t * sm, IN osm_node_t * p_node)
332 osm_physp_t *p_physp;
334 osm_node_t *p_node_check;
337 ib_net64_t port_guid;
338 boolean_t return_val = FALSE;
340 OSM_LOG_ENTER(sm->p_log);
342 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
343 "Unreachable node 0x%016" PRIx64 "\n",
344 cl_ntoh64(osm_node_get_node_guid(p_node)));
/* The node is handed to the unicast cache only while that cache is valid. */
346 if (sm->ucast_mgr.cache_valid)
347 osm_ucast_cache_add_node(&sm->ucast_mgr, p_node);
350 Delete all the logical and physical port objects
351 associated with this node.
353 max_ports = osm_node_get_num_physp(p_node);
354 for (port_num = 0; port_num < max_ports; port_num++) {
355 p_physp = osm_node_get_physp_ptr(p_node, port_num);
357 port_guid = osm_physp_get_port_guid(p_physp);
359 p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
362 drop_mgr_remove_port(sm, p_port);
364 drop_mgr_clean_physp(sm, p_physp);
371 drop_mgr_remove_switch(sm, p_node);
/* The entry removed from the node GUID table is checked against p_node. */
374 (osm_node_t *) cl_qmap_remove(&sm->p_subn->node_guid_tbl,
375 osm_node_get_node_guid(p_node));
376 if (p_node_check != p_node) {
377 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0105: "
378 "Node 0x%016" PRIx64 " not in guid table\n",
379 cl_ntoh64(osm_node_get_node_guid(p_node)));
382 /* free memory allocated to node */
383 osm_node_delete(&p_node);
385 OSM_LOG_EXIT(sm->p_log);
/*
 * drop_mgr_check_switch_node
 *
 * Checks that a switch node was fully discovered and drops it otherwise.
 *
 * drop_mgr_process_node() is called for the node in each of the cases
 * logged below:
 *  - "no switch in table";
 *  - "no valid physical port 0";
 *  - "has no port object" (no port object for the port 0 GUID);
 *  - physp_discovered[0] is zero.
 *
 * For a node that passes those checks, and when the port 0 object's
 * discovery_count is below p_node->physp_tbl_size, every port number from 1
 * whose physp_discovered flag is zero is examined: the link to its remote
 * port is logged, recorded in the unicast cache when cache_valid is set,
 * and removed with osm_node_unlink().
 *
 * sm     - SM object; supplies the subnet, log and unicast manager.
 * p_node - switch node to check.
 *
 * NOTE(review): the conditions guarding the first three drop cases, the
 * early exits after each drop, the declaration of p_port and several
 * argument lines are not among the visible lines - confirm against the
 * full file.
 */
389 static void drop_mgr_check_switch_node(osm_sm_t * sm, IN osm_node_t * p_node)
391 ib_net64_t node_guid;
392 osm_physp_t *p_physp, *p_remote_physp;
393 osm_node_t *p_remote_node;
395 ib_net64_t port_guid;
396 uint8_t port_num, remote_port_num;
398 OSM_LOG_ENTER(sm->p_log);
400 node_guid = osm_node_get_node_guid(p_node);
402 /* Make sure we have a switch object for this node */
404 /* We do not have switch info for this node */
405 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
406 "Node 0x%016" PRIx64 " no switch in table\n",
407 cl_ntoh64(node_guid));
409 drop_mgr_process_node(sm, p_node);
413 /* Make sure we have a port object for port zero */
414 p_physp = osm_node_get_physp_ptr(p_node, 0);
416 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
417 "Node 0x%016" PRIx64 " no valid physical port 0\n",
418 cl_ntoh64(node_guid));
420 drop_mgr_process_node(sm, p_node);
424 port_guid = osm_physp_get_port_guid(p_physp);
426 p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
429 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
430 "Node 0x%016" PRIx64 " has no port object\n",
431 cl_ntoh64(node_guid));
433 drop_mgr_process_node(sm, p_node);
/* Port 0 itself must have been seen during the last sweep. */
437 if (!p_node->physp_discovered[0]) {
438 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
439 "Node 0x%016" PRIx64 " port has discovery count zero\n",
440 cl_ntoh64(node_guid));
442 drop_mgr_process_node(sm, p_node);
447 * Unlink all ports that havn't been discovered during the last sweep.
448 * Optimization: Skip the check if discovered all the ports of the switch.
450 if (p_port->discovery_count < p_node->physp_tbl_size) {
451 for (port_num = 1; port_num < p_node->physp_tbl_size; port_num++) {
452 if (!p_node->physp_discovered[port_num]) {
453 p_physp = osm_node_get_physp_ptr(p_node, port_num);
456 p_remote_physp = osm_physp_get_remote(p_physp);
461 osm_physp_get_node_ptr(p_remote_physp);
463 osm_physp_get_port_num(p_remote_physp);
465 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
466 "Unlinking local node 0x%" PRIx64
468 "\n\t\t\t\tand remote node 0x%" PRIx64
469 ", port %u due to missing PortInfo\n",
470 cl_ntoh64(osm_node_get_node_guid
472 cl_ntoh64(osm_node_get_node_guid
476 if (sm->ucast_mgr.cache_valid)
477 osm_ucast_cache_add_link(&sm->ucast_mgr,
481 osm_node_unlink(p_node, (uint8_t) port_num,
483 (uint8_t) remote_port_num);
488 OSM_LOG_EXIT(sm->p_log);
/*
 * osm_drop_mgr_process
 *
 * Entry point of the drop manager.  Runs with sm->p_lock held exclusively
 * (CL_PLOCK_EXCL_ACQUIRE at the start, CL_PLOCK_RELEASE at the end) and
 * makes three passes over the subnet tables:
 *
 *  1. Node GUID table: a node whose discovery_count is 0 is passed to
 *     drop_mgr_process_node().  The same pass also walks the node's
 *     physical ports for the P_Key handling described in the in-line text:
 *     it resets pkeys.rcv_blocks_cnt, sets need_update to 2, flags
 *     subnet_initialization_error and, for a port marked discovered, clears
 *     the flag and decrements the port object's discovery_count.
 *  2. Node GUID table again: every switch node is passed to
 *     drop_mgr_check_switch_node().
 *  3. Port GUID table: a port whose discovery_count is 0 is passed to
 *     drop_mgr_remove_port().
 *
 * In each pass the successor item is fetched before the current item is
 * processed, since the called helpers remove items from these tables.
 *
 * sm - SM object; supplies the subnet, log and lock.
 *
 * NOTE(review): several lines of the first pass (the else branch, the
 * starting value of port_num, the statement guarded by the rcv_blocks_cnt
 * check and any NULL check on p_port) are not among the visible lines -
 * confirm against the full file.
 */
492 void osm_drop_mgr_process(osm_sm_t * sm)
494 cl_qmap_t *p_node_guid_tbl, *p_port_guid_tbl;
495 osm_port_t *p_port, *p_next_port;
496 osm_node_t *p_node, *p_next_node;
497 int max_ports, port_num;
498 osm_physp_t *p_physp;
499 ib_net64_t port_guid;
503 OSM_LOG_ENTER(sm->p_log);
505 p_node_guid_tbl = &sm->p_subn->node_guid_tbl;
506 p_port_guid_tbl = &sm->p_subn->port_guid_tbl;
508 CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
/* Pass 1: walk the node GUID table. */
510 p_next_node = (osm_node_t *) cl_qmap_head(p_node_guid_tbl);
511 while (p_next_node != (osm_node_t *) cl_qmap_end(p_node_guid_tbl)) {
512 p_node = p_next_node;
514 (osm_node_t *) cl_qmap_next(&p_next_node->map_item);
516 CL_ASSERT(cl_qmap_key(&p_node->map_item) ==
517 osm_node_get_node_guid(p_node));
519 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
520 "Checking node 0x%016" PRIx64 "\n",
521 cl_ntoh64(osm_node_get_node_guid(p_node)));
524 Check if this node was discovered during the last sweep.
525 If not, it is unreachable in the current subnet, and
526 should therefore be removed from the subnet object.
528 if (p_node->discovery_count == 0)
529 drop_mgr_process_node(sm, p_node);
532 * We want to preserve the configured pkey indexes,
533 * so if we don't receive GetResp P_KeyTable for some block,
535 * 1. Drop node if the node is sw and got timeout for port 0.
536 * 2. Drop node if node is HCA/RTR.
537 * 3. Drop only physp if got timeout for sw when the port isn't 0.
538 * We'll set error during initialization in order to
539 * cause an immediate heavy sweep and try to get the
540 * configured P_KeyTable again.
542 if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH)
546 max_ports = osm_node_get_num_physp(p_node);
547 for (; port_num < max_ports; port_num++) {
548 p_physp = osm_node_get_physp_ptr(p_node, port_num);
549 if (!p_physp || p_physp->pkeys.rcv_blocks_cnt == 0)
551 p_physp->pkeys.rcv_blocks_cnt = 0;
552 p_physp->need_update = 2;
553 sm->p_subn->subnet_initialization_error = TRUE;
554 port_guid = osm_physp_get_port_guid(p_physp);
555 p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
557 if (p_node->physp_discovered[port_num]) {
558 p_node->physp_discovered[port_num] = 0;
559 p_port->discovery_count--;
566 Go over all the nodes. If the node is a switch - make sure
567 there is also a switch record for it, and a portInfo record for
568 port zero of of the node.
569 If not - this means that there was some error in getting the data
570 of this node. Drop the node.
/* Pass 2: walk the node GUID table again, looking only at switches. */
572 p_next_node = (osm_node_t *) cl_qmap_head(p_node_guid_tbl);
573 while (p_next_node != (osm_node_t *) cl_qmap_end(p_node_guid_tbl)) {
574 p_node = p_next_node;
576 (osm_node_t *) cl_qmap_next(&p_next_node->map_item);
578 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
579 "Checking full discovery of node 0x%016" PRIx64 "\n",
580 cl_ntoh64(osm_node_get_node_guid(p_node)));
582 if (osm_node_get_type(p_node) != IB_NODE_TYPE_SWITCH)
585 /* We are handling a switch node */
586 drop_mgr_check_switch_node(sm, p_node);
/* Pass 3: walk the port GUID table. */
589 p_next_port = (osm_port_t *) cl_qmap_head(p_port_guid_tbl);
590 while (p_next_port != (osm_port_t *) cl_qmap_end(p_port_guid_tbl)) {
591 p_port = p_next_port;
593 (osm_port_t *) cl_qmap_next(&p_next_port->map_item);
595 CL_ASSERT(cl_qmap_key(&p_port->map_item) ==
596 osm_port_get_guid(p_port));
598 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
599 "Checking port 0x%016" PRIx64 "\n",
600 cl_ntoh64(osm_port_get_guid(p_port)));
603 If the port is unreachable, remove it from the guid table.
605 if (p_port->discovery_count == 0)
606 drop_mgr_remove_port(sm, p_port);
609 CL_PLOCK_RELEASE(sm->p_lock);
610 OSM_LOG_EXIT(sm->p_log);