/*
 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/*
 * Abstract:
 *    Implementation of osm_sm_state_mgr_t.
 * This file implements the SM State Manager object.
 */
#if HAVE_CONFIG_H
#  include <config.h>
#endif				/* HAVE_CONFIG_H */

#include <stdio.h>
#include <string.h>
#include <iba/ib_types.h>
#include <complib/cl_passivelock.h>
#include <complib/cl_debug.h>
#include <opensm/osm_sm.h>
#include <opensm/osm_madw.h>
#include <opensm/osm_switch.h>
#include <opensm/osm_log.h>
#include <opensm/osm_subnet.h>
#include <opensm/osm_helper.h>
#include <opensm/osm_msgdef.h>
#include <opensm/osm_node.h>
#include <opensm/osm_port.h>
#include <vendor/osm_vendor_api.h>
#include <opensm/osm_helper.h>
#include <opensm/osm_opensm.h>

64 /**********************************************************************
65 **********************************************************************/
66 void osm_report_sm_state(osm_sm_t * sm)
69 const char *state_str = osm_get_sm_mgr_state_str(sm->p_subn->sm_state);
71 osm_log(sm->p_log, OSM_LOG_SYS, "Entering %s state\n", state_str);
72 snprintf(buf, sizeof(buf), "ENTERING SM %s STATE", state_str);
73 OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, buf);
76 /**********************************************************************
77 **********************************************************************/
78 static void __osm_sm_state_mgr_send_master_sm_info_req(osm_sm_t * sm)
80 osm_madw_context_t context;
81 const osm_port_t *p_port;
82 ib_api_status_t status;
84 OSM_LOG_ENTER(sm->p_log);
86 memset(&context, 0, sizeof(context));
87 if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY) {
89 * We are in STANDBY state - this means we need to poll on the master
90 * SM (according to master_guid)
91 * Send a query of SubnGet(SMInfo) to the subn master_sm_base_lid object.
93 p_port = osm_get_port_by_guid(sm->p_subn, sm->master_sm_guid);
96 * We are not in STANDBY - this means we are in MASTER state - so we need
97 * to poll on the SM that is saved in p_polling_sm under sm.
98 * Send a query of SubnGet(SMInfo) to that SM.
100 p_port = sm->p_polling_sm->p_port;
102 if (p_port == NULL) {
103 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3203: "
104 "No port object for GUID 0x%016" PRIx64 "\n",
105 cl_ntoh64(sm->master_sm_guid));
109 context.smi_context.port_guid = p_port->guid;
110 context.smi_context.set_method = FALSE;
112 status = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_port->p_physp),
113 IB_MAD_ATTR_SM_INFO, 0, CL_DISP_MSGID_NONE,
116 if (status != IB_SUCCESS)
117 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3204: "
118 "Failure requesting SMInfo (%s)\n",
119 ib_get_err_str(status));
122 OSM_LOG_EXIT(sm->p_log);
125 /**********************************************************************
126 **********************************************************************/
127 static void __osm_sm_state_mgr_start_polling(osm_sm_t * sm)
129 uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout;
130 cl_status_t cl_status;
132 OSM_LOG_ENTER(sm->p_log);
135 * Init the retry_number back to zero - need to restart counting
137 sm->retry_number = 0;
140 * Send a SubnGet(SMInfo) query to the current (or new) master found.
142 __osm_sm_state_mgr_send_master_sm_info_req(sm);
145 * Start a timer that will wake up every sminfo_polling_timeout milliseconds.
146 * The callback of the timer will send a SubnGet(SMInfo) to the Master SM
147 * and restart the timer
149 cl_status = cl_timer_start(&sm->polling_timer, timeout);
150 if (cl_status != CL_SUCCESS)
151 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3210: "
152 "Failed to start timer\n");
154 OSM_LOG_EXIT(sm->p_log);
157 /**********************************************************************
158 **********************************************************************/
159 void osm_sm_state_mgr_polling_callback(IN void *context)
161 osm_sm_t *sm = context;
162 uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout;
163 cl_status_t cl_status;
165 OSM_LOG_ENTER(sm->p_log);
168 * We can be here in one of two cases:
169 * 1. We are a STANDBY sm polling on the master SM.
170 * 2. We are a MASTER sm, waiting for a handover from a remote master sm.
171 * If we are not in one of these cases - don't need to restart the poller.
173 if (!((sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER &&
174 sm->p_polling_sm != NULL) ||
175 (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY)))
179 * If we are a STANDBY sm and the osm_exit_flag is set, then let's
180 * signal the subnet_up. This is relevant for the case of running only
181 * once. In that case - the program is stuck until this signal is
182 * received. In other cases - it is not relevant whether or not the
183 * signal is on - since we are currently in exit flow
185 if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY && osm_exit_flag) {
186 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
187 "Signalling subnet_up_event\n");
188 cl_event_signal(&sm->subnet_up_event);
193 * Incr the retry number.
194 * If it reached the max_retry_number in the subnet opt - call
195 * osm_sm_state_mgr_process with signal OSM_SM_SIGNAL_POLLING_TIMEOUT
198 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
199 "Retry number:%d\n", sm->retry_number);
201 if (sm->retry_number >= sm->p_subn->opt.polling_retry_number) {
202 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
203 "Reached polling_retry_number value in retry_number. "
204 "Go to DISCOVERY state\n");
205 osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_POLLING_TIMEOUT);
209 /* Send a SubnGet(SMInfo) request to the remote sm (depends on our state) */
210 __osm_sm_state_mgr_send_master_sm_info_req(sm);
212 /* restart the timer */
213 cl_status = cl_timer_start(&sm->polling_timer, timeout);
214 if (cl_status != CL_SUCCESS)
215 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3211: "
216 "Failed to restart timer\n");
219 OSM_LOG_EXIT(sm->p_log);
223 /**********************************************************************
224 **********************************************************************/
225 static void __osm_sm_state_mgr_signal_error(osm_sm_t * sm,
226 IN const osm_sm_signal_t signal)
228 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3207: "
229 "Invalid signal %s in state %s\n",
230 osm_get_sm_mgr_signal_str(signal),
231 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
234 /**********************************************************************
235 **********************************************************************/
236 void osm_sm_state_mgr_signal_master_is_alive(osm_sm_t * sm)
238 OSM_LOG_ENTER(sm->p_log);
239 sm->retry_number = 0;
240 OSM_LOG_EXIT(sm->p_log);
243 /**********************************************************************
244 **********************************************************************/
245 ib_api_status_t osm_sm_state_mgr_process(osm_sm_t * sm,
246 IN osm_sm_signal_t signal)
248 ib_api_status_t status = IB_SUCCESS;
252 OSM_LOG_ENTER(sm->p_log);
255 * The state lock prevents many race conditions from screwing
256 * up the state transition process.
258 cl_spinlock_acquire(&sm->state_lock);
260 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
261 "Received signal %s in state %s\n",
262 osm_get_sm_mgr_signal_str(signal),
263 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
265 switch (sm->p_subn->sm_state) {
266 case IB_SMINFO_STATE_DISCOVERING:
268 case OSM_SM_SIGNAL_DISCOVERY_COMPLETED:
270 * Update the state of the SM to MASTER
272 /* Turn on the first_time_master_sweep flag */
273 sm->p_subn->first_time_master_sweep = TRUE;
274 sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER;
275 osm_report_sm_state(sm);
277 * Make sure to set the subnet master_sm_base_lid
278 * to the sm_base_lid value
280 sm->p_subn->master_sm_base_lid =
281 sm->p_subn->sm_base_lid;
283 case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED:
285 * Finished all discovery actions - move to STANDBY
288 sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
289 osm_report_sm_state(sm);
291 * Since another SM is doing the LFT config - we should not
292 * ignore the results of it
294 sm->p_subn->ignore_existing_lfts = FALSE;
296 __osm_sm_state_mgr_start_polling(sm);
298 case OSM_SM_SIGNAL_HANDOVER:
300 * Do nothing. We will discover it later on. If we already discovered
301 * this SM, and got the HANDOVER - this means the remote SM is of
302 * lower priority. In this case we will stop polling it (since it is
303 * a lower priority SM in STANDBY state).
307 __osm_sm_state_mgr_signal_error(sm, signal);
308 status = IB_INVALID_PARAMETER;
313 case IB_SMINFO_STATE_STANDBY:
315 case OSM_SM_SIGNAL_POLLING_TIMEOUT:
316 case OSM_SM_SIGNAL_DISCOVER:
318 * case 1: Polling timeout occured - this means that the Master SM
319 * is no longer alive.
320 * case 2: Got a signal to move to DISCOVERING
321 * Move to DISCOVERING state and start sweeping
323 sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING;
324 osm_report_sm_state(sm);
325 sm->p_subn->coming_out_of_standby = TRUE;
326 osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
328 case OSM_SM_SIGNAL_DISABLE:
330 * Update the state to NOT_ACTIVE
332 sm->p_subn->sm_state = IB_SMINFO_STATE_NOTACTIVE;
333 osm_report_sm_state(sm);
334 osm_vendor_set_sm(sm->mad_ctrl.h_bind, FALSE);
336 case OSM_SM_SIGNAL_HANDOVER:
338 * Update the state to MASTER, and start sweeping
339 * OPTIONAL: send ACKNOWLEDGE
341 /* Turn on the first_time_master_sweep flag */
342 sm->p_subn->first_time_master_sweep = TRUE;
343 /* Turn on the force_heavy_sweep - we want a
344 * heavy sweep to occur on the first sweep of this SM. */
345 sm->p_subn->force_heavy_sweep = TRUE;
347 sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER;
348 osm_report_sm_state(sm);
350 * Make sure to set the subnet master_sm_base_lid
351 * to the sm_base_lid value
353 sm->p_subn->master_sm_base_lid =
354 sm->p_subn->sm_base_lid;
355 sm->p_subn->coming_out_of_standby = TRUE;
356 osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
358 case OSM_SM_SIGNAL_ACKNOWLEDGE:
360 * Do nothing - already moved to STANDBY
364 __osm_sm_state_mgr_signal_error(sm, signal);
365 status = IB_INVALID_PARAMETER;
370 case IB_SMINFO_STATE_NOTACTIVE:
372 case OSM_SM_SIGNAL_STANDBY:
374 * Update the state to STANDBY
377 sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
378 osm_report_sm_state(sm);
379 __osm_sm_state_mgr_start_polling(sm);
382 __osm_sm_state_mgr_signal_error(sm, signal);
383 status = IB_INVALID_PARAMETER;
388 case IB_SMINFO_STATE_MASTER:
390 case OSM_SM_SIGNAL_POLLING_TIMEOUT:
392 * we received a polling timeout - this means that we waited for
393 * a remote master sm to send us a handover, but didn't get it, and
394 * didn't get a response from that remote sm.
395 * We want to force a heavy sweep - hopefully this occurred because
396 * the remote sm died, and we'll find this out and configure the
397 * subnet after a heavy sweep.
398 * We also want to clear the p_polling_sm object - since we are
399 * done polling on that remote sm - we are sweeping again.
401 case OSM_SM_SIGNAL_HANDOVER:
403 * If we received a handover in a master state - then we want to
404 * force a heavy sweep. This means that either we are in a sweep
405 * currently - in this case - no change, or we are in idle state -
406 * since we recognized a master SM before - so we want to make a
407 * heavy sweep and reconfigure the new subnet.
408 * We also want to clear the p_polling_sm object - since we are
409 * done polling on that remote sm - we got a handover from it.
411 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
412 "Forcing heavy sweep. "
413 "Received OSM_SM_SIGNAL_HANDOVER or OSM_SM_SIGNAL_POLLING_TIMEOUT\n");
414 sm->p_polling_sm = NULL;
415 sm->p_subn->force_heavy_sweep = TRUE;
416 osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
418 case OSM_SM_SIGNAL_HANDOVER_SENT:
420 * Just sent a HANDOVER signal - move to STANDBY
423 sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
424 osm_report_sm_state(sm);
425 __osm_sm_state_mgr_start_polling(sm);
427 case OSM_SM_SIGNAL_WAIT_FOR_HANDOVER:
429 * We found a remote master SM, and we are waiting for it
430 * to handover the mastership to us. Need to start polling
431 * on that SM, to make sure it is alive, if it isn't - then
432 * we should move back to discovering, since something must
433 * have happened to it.
435 __osm_sm_state_mgr_start_polling(sm);
437 case OSM_SM_SIGNAL_DISCOVER:
438 sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING;
439 osm_report_sm_state(sm);
442 __osm_sm_state_mgr_signal_error(sm, signal);
443 status = IB_INVALID_PARAMETER;
449 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3208: "
450 "Invalid state %s\n",
451 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
455 cl_spinlock_release(&sm->state_lock);
457 OSM_LOG_EXIT(sm->p_log);
461 /**********************************************************************
462 **********************************************************************/
463 ib_api_status_t osm_sm_state_mgr_check_legality(osm_sm_t * sm,
464 IN osm_sm_signal_t signal)
466 ib_api_status_t status = IB_SUCCESS;
470 OSM_LOG_ENTER(sm->p_log);
473 * The state lock prevents many race conditions from screwing
474 * up the state transition process.
476 cl_spinlock_acquire(&sm->state_lock);
478 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
479 "Received signal %s in state %s\n",
480 osm_get_sm_mgr_signal_str(signal),
481 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
483 switch (sm->p_subn->sm_state) {
484 case IB_SMINFO_STATE_DISCOVERING:
486 case OSM_SM_SIGNAL_DISCOVERY_COMPLETED:
487 case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED:
488 case OSM_SM_SIGNAL_HANDOVER:
492 __osm_sm_state_mgr_signal_error(sm, signal);
493 status = IB_INVALID_PARAMETER;
498 case IB_SMINFO_STATE_STANDBY:
500 case OSM_SM_SIGNAL_POLLING_TIMEOUT:
501 case OSM_SM_SIGNAL_DISCOVER:
502 case OSM_SM_SIGNAL_DISABLE:
503 case OSM_SM_SIGNAL_HANDOVER:
504 case OSM_SM_SIGNAL_ACKNOWLEDGE:
508 __osm_sm_state_mgr_signal_error(sm, signal);
509 status = IB_INVALID_PARAMETER;
514 case IB_SMINFO_STATE_NOTACTIVE:
516 case OSM_SM_SIGNAL_STANDBY:
520 __osm_sm_state_mgr_signal_error(sm, signal);
521 status = IB_INVALID_PARAMETER;
526 case IB_SMINFO_STATE_MASTER:
528 case OSM_SM_SIGNAL_HANDOVER:
529 case OSM_SM_SIGNAL_HANDOVER_SENT:
533 __osm_sm_state_mgr_signal_error(sm, signal);
534 status = IB_INVALID_PARAMETER;
540 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3209: "
541 "Invalid state %s\n",
542 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
543 status = IB_INVALID_PARAMETER;
547 cl_spinlock_release(&sm->state_lock);
549 OSM_LOG_EXIT(sm->p_log);