]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - contrib/ofed/management/opensm/opensm/osm_sm_state_mgr.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / contrib / ofed / management / opensm / opensm / osm_sm_state_mgr.c
1 /*
2  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  *
34  */
35
36 /*
37  * Abstract:
38  *    Implementation of osm_sm_state_mgr_t.
39  * This file implements the SM State Manager object.
40  */
41
42 #if HAVE_CONFIG_H
43 #  include <config.h>
44 #endif                          /* HAVE_CONFIG_H */
45
46 #include <string.h>
47 #include <time.h>
48 #include <iba/ib_types.h>
49 #include <complib/cl_passivelock.h>
50 #include <complib/cl_debug.h>
51 #include <opensm/osm_sm.h>
52 #include <opensm/osm_madw.h>
53 #include <opensm/osm_switch.h>
54 #include <opensm/osm_log.h>
55 #include <opensm/osm_subnet.h>
56 #include <opensm/osm_helper.h>
57 #include <opensm/osm_msgdef.h>
58 #include <opensm/osm_node.h>
59 #include <opensm/osm_port.h>
60 #include <vendor/osm_vendor_api.h>
61 #include <opensm/osm_helper.h>
62 #include <opensm/osm_opensm.h>
63
64 /**********************************************************************
65  **********************************************************************/
66 void osm_report_sm_state(osm_sm_t * sm)
67 {
68         char buf[64];
69         const char *state_str = osm_get_sm_mgr_state_str(sm->p_subn->sm_state);
70
71         osm_log(sm->p_log, OSM_LOG_SYS, "Entering %s state\n", state_str);
72         snprintf(buf, sizeof(buf), "ENTERING SM %s STATE", state_str);
73         OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, buf);
74 }
75
76 /**********************************************************************
77  **********************************************************************/
78 static void __osm_sm_state_mgr_send_master_sm_info_req(osm_sm_t * sm)
79 {
80         osm_madw_context_t context;
81         const osm_port_t *p_port;
82         ib_api_status_t status;
83
84         OSM_LOG_ENTER(sm->p_log);
85
86         memset(&context, 0, sizeof(context));
87         if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY) {
88                 /*
89                  * We are in STANDBY state - this means we need to poll on the master
90                  * SM (according to master_guid)
91                  * Send a query of SubnGet(SMInfo) to the subn master_sm_base_lid object.
92                  */
93                 p_port = osm_get_port_by_guid(sm->p_subn, sm->master_sm_guid);
94         } else {
95                 /*
96                  * We are not in STANDBY - this means we are in MASTER state - so we need
97                  * to poll on the SM that is saved in p_polling_sm under sm.
98                  * Send a query of SubnGet(SMInfo) to that SM.
99                  */
100                 p_port = sm->p_polling_sm->p_port;
101         }
102         if (p_port == NULL) {
103                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3203: "
104                         "No port object for GUID 0x%016" PRIx64 "\n",
105                         cl_ntoh64(sm->master_sm_guid));
106                 goto Exit;
107         }
108
109         context.smi_context.port_guid = p_port->guid;
110         context.smi_context.set_method = FALSE;
111
112         status = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_port->p_physp),
113                              IB_MAD_ATTR_SM_INFO, 0, CL_DISP_MSGID_NONE,
114                              &context);
115
116         if (status != IB_SUCCESS)
117                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3204: "
118                         "Failure requesting SMInfo (%s)\n",
119                         ib_get_err_str(status));
120
121 Exit:
122         OSM_LOG_EXIT(sm->p_log);
123 }
124
125 /**********************************************************************
126  **********************************************************************/
127 static void __osm_sm_state_mgr_start_polling(osm_sm_t * sm)
128 {
129         uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout;
130         cl_status_t cl_status;
131
132         OSM_LOG_ENTER(sm->p_log);
133
134         /*
135          * Init the retry_number back to zero - need to restart counting
136          */
137         sm->retry_number = 0;
138
139         /*
140          * Send a SubnGet(SMInfo) query to the current (or new) master found.
141          */
142         __osm_sm_state_mgr_send_master_sm_info_req(sm);
143
144         /*
145          * Start a timer that will wake up every sminfo_polling_timeout milliseconds.
146          * The callback of the timer will send a SubnGet(SMInfo) to the Master SM
147          * and restart the timer
148          */
149         cl_status = cl_timer_start(&sm->polling_timer, timeout);
150         if (cl_status != CL_SUCCESS)
151                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3210: "
152                         "Failed to start timer\n");
153
154         OSM_LOG_EXIT(sm->p_log);
155 }
156
157 /**********************************************************************
158  **********************************************************************/
159 void osm_sm_state_mgr_polling_callback(IN void *context)
160 {
161         osm_sm_t *sm = context;
162         uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout;
163         cl_status_t cl_status;
164
165         OSM_LOG_ENTER(sm->p_log);
166
167         /*
168          * We can be here in one of two cases:
169          * 1. We are a STANDBY sm polling on the master SM.
170          * 2. We are a MASTER sm, waiting for a handover from a remote master sm.
171          * If we are not in one of these cases - don't need to restart the poller.
172          */
173         if (!((sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER &&
174                sm->p_polling_sm != NULL) ||
175               (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY)))
176                 goto Exit;
177
178         /*
179          * If we are a STANDBY sm and the osm_exit_flag is set, then let's
180          * signal the subnet_up. This is relevant for the case of running only
181          * once. In that case - the program is stuck until this signal is
182          * received. In other cases - it is not relevant whether or not the
183          * signal is on - since we are currently in exit flow
184          */
185         if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY && osm_exit_flag) {
186                 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
187                         "Signalling subnet_up_event\n");
188                 cl_event_signal(&sm->subnet_up_event);
189                 goto Exit;
190         }
191
192         /*
193          * Incr the retry number.
194          * If it reached the max_retry_number in the subnet opt - call
195          * osm_sm_state_mgr_process with signal OSM_SM_SIGNAL_POLLING_TIMEOUT
196          */
197         sm->retry_number++;
198         OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
199                 "Retry number:%d\n", sm->retry_number);
200
201         if (sm->retry_number >= sm->p_subn->opt.polling_retry_number) {
202                 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
203                         "Reached polling_retry_number value in retry_number. "
204                         "Go to DISCOVERY state\n");
205                 osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_POLLING_TIMEOUT);
206                 goto Exit;
207         }
208
209         /* Send a SubnGet(SMInfo) request to the remote sm (depends on our state) */
210         __osm_sm_state_mgr_send_master_sm_info_req(sm);
211
212         /* restart the timer */
213         cl_status = cl_timer_start(&sm->polling_timer, timeout);
214         if (cl_status != CL_SUCCESS)
215                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3211: "
216                         "Failed to restart timer\n");
217
218 Exit:
219         OSM_LOG_EXIT(sm->p_log);
220         return;
221 }
222
223 /**********************************************************************
224  **********************************************************************/
225 static void __osm_sm_state_mgr_signal_error(osm_sm_t * sm,
226                                             IN const osm_sm_signal_t signal)
227 {
228         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3207: "
229                 "Invalid signal %s in state %s\n",
230                 osm_get_sm_mgr_signal_str(signal),
231                 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
232 }
233
234 /**********************************************************************
235  **********************************************************************/
236 void osm_sm_state_mgr_signal_master_is_alive(osm_sm_t * sm)
237 {
238         OSM_LOG_ENTER(sm->p_log);
239         sm->retry_number = 0;
240         OSM_LOG_EXIT(sm->p_log);
241 }
242
243 /**********************************************************************
244  **********************************************************************/
245 ib_api_status_t osm_sm_state_mgr_process(osm_sm_t * sm,
246                                          IN osm_sm_signal_t signal)
247 {
248         ib_api_status_t status = IB_SUCCESS;
249
250         CL_ASSERT(sm);
251
252         OSM_LOG_ENTER(sm->p_log);
253
254         /*
255          * The state lock prevents many race conditions from screwing
256          * up the state transition process.
257          */
258         cl_spinlock_acquire(&sm->state_lock);
259
260         OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
261                 "Received signal %s in state %s\n",
262                 osm_get_sm_mgr_signal_str(signal),
263                 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
264
265         switch (sm->p_subn->sm_state) {
266         case IB_SMINFO_STATE_DISCOVERING:
267                 switch (signal) {
268                 case OSM_SM_SIGNAL_DISCOVERY_COMPLETED:
269                         /*
270                          * Update the state of the SM to MASTER
271                          */
272                         /* Turn on the first_time_master_sweep flag */
273                         sm->p_subn->first_time_master_sweep = TRUE;
274                         sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER;
275                         osm_report_sm_state(sm);
276                         /*
277                          * Make sure to set the subnet master_sm_base_lid
278                          * to the sm_base_lid value
279                          */
280                         sm->p_subn->master_sm_base_lid =
281                             sm->p_subn->sm_base_lid;
282                         break;
283                 case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED:
284                         /*
285                          * Finished all discovery actions - move to STANDBY
286                          * start the polling
287                          */
288                         sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
289                         osm_report_sm_state(sm);
290                         /*
291                          * Since another SM is doing the LFT config - we should not
292                          * ignore the results of it
293                          */
294                         sm->p_subn->ignore_existing_lfts = FALSE;
295
296                         __osm_sm_state_mgr_start_polling(sm);
297                         break;
298                 case OSM_SM_SIGNAL_HANDOVER:
299                         /*
300                          * Do nothing. We will discover it later on. If we already discovered
301                          * this SM, and got the HANDOVER - this means the remote SM is of
302                          * lower priority. In this case we will stop polling it (since it is
303                          * a lower priority SM in STANDBY state).
304                          */
305                         break;
306                 default:
307                         __osm_sm_state_mgr_signal_error(sm, signal);
308                         status = IB_INVALID_PARAMETER;
309                         break;
310                 }
311                 break;
312
313         case IB_SMINFO_STATE_STANDBY:
314                 switch (signal) {
315                 case OSM_SM_SIGNAL_POLLING_TIMEOUT:
316                 case OSM_SM_SIGNAL_DISCOVER:
317                         /*
318                          * case 1: Polling timeout occured - this means that the Master SM
319                          * is no longer alive.
320                          * case 2: Got a signal to move to DISCOVERING
321                          * Move to DISCOVERING state and start sweeping
322                          */
323                         sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING;
324                         osm_report_sm_state(sm);
325                         sm->p_subn->coming_out_of_standby = TRUE;
326                         osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
327                         break;
328                 case OSM_SM_SIGNAL_DISABLE:
329                         /*
330                          * Update the state to NOT_ACTIVE
331                          */
332                         sm->p_subn->sm_state = IB_SMINFO_STATE_NOTACTIVE;
333                         osm_report_sm_state(sm);
334                         osm_vendor_set_sm(sm->mad_ctrl.h_bind, FALSE);
335                         break;
336                 case OSM_SM_SIGNAL_HANDOVER:
337                         /*
338                          * Update the state to MASTER, and start sweeping
339                          * OPTIONAL: send ACKNOWLEDGE
340                          */
341                         /* Turn on the first_time_master_sweep flag */
342                         sm->p_subn->first_time_master_sweep = TRUE;
343                         /* Turn on the force_heavy_sweep - we want a
344                          * heavy sweep to occur on the first sweep of this SM. */
345                         sm->p_subn->force_heavy_sweep = TRUE;
346
347                         sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER;
348                         osm_report_sm_state(sm);
349                         /*
350                          * Make sure to set the subnet master_sm_base_lid
351                          * to the sm_base_lid value
352                          */
353                         sm->p_subn->master_sm_base_lid =
354                             sm->p_subn->sm_base_lid;
355                         sm->p_subn->coming_out_of_standby = TRUE;
356                         osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
357                         break;
358                 case OSM_SM_SIGNAL_ACKNOWLEDGE:
359                         /*
360                          * Do nothing - already moved to STANDBY
361                          */
362                         break;
363                 default:
364                         __osm_sm_state_mgr_signal_error(sm, signal);
365                         status = IB_INVALID_PARAMETER;
366                         break;
367                 }
368                 break;
369
370         case IB_SMINFO_STATE_NOTACTIVE:
371                 switch (signal) {
372                 case OSM_SM_SIGNAL_STANDBY:
373                         /*
374                          * Update the state to STANDBY
375                          * start the polling
376                          */
377                         sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
378                         osm_report_sm_state(sm);
379                         __osm_sm_state_mgr_start_polling(sm);
380                         break;
381                 default:
382                         __osm_sm_state_mgr_signal_error(sm, signal);
383                         status = IB_INVALID_PARAMETER;
384                         break;
385                 }
386                 break;
387
388         case IB_SMINFO_STATE_MASTER:
389                 switch (signal) {
390                 case OSM_SM_SIGNAL_POLLING_TIMEOUT:
391                         /*
392                          * we received a polling timeout - this means that we waited for
393                          * a remote master sm to send us a handover, but didn't get it, and
394                          * didn't get a response from that remote sm.
395                          * We want to force a heavy sweep - hopefully this occurred because
396                          * the remote sm died, and we'll find this out and configure the
397                          * subnet after a heavy sweep.
398                          * We also want to clear the p_polling_sm object - since we are
399                          * done polling on that remote sm - we are sweeping again.
400                          */
401                 case OSM_SM_SIGNAL_HANDOVER:
402                         /*
403                          * If we received a handover in a master state - then we want to
404                          * force a heavy sweep. This means that either we are in a sweep
405                          * currently - in this case - no change, or we are in idle state -
406                          * since we recognized a master SM before - so we want to make a
407                          * heavy sweep and reconfigure the new subnet.
408                          * We also want to clear the p_polling_sm object - since we are
409                          * done polling on that remote sm - we got a handover from it.
410                          */
411                         OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
412                                 "Forcing heavy sweep. "
413                                 "Received OSM_SM_SIGNAL_HANDOVER or OSM_SM_SIGNAL_POLLING_TIMEOUT\n");
414                         sm->p_polling_sm = NULL;
415                         sm->p_subn->force_heavy_sweep = TRUE;
416                         osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
417                         break;
418                 case OSM_SM_SIGNAL_HANDOVER_SENT:
419                         /*
420                          * Just sent a HANDOVER signal - move to STANDBY
421                          * start the polling
422                          */
423                         sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
424                         osm_report_sm_state(sm);
425                         __osm_sm_state_mgr_start_polling(sm);
426                         break;
427                 case OSM_SM_SIGNAL_WAIT_FOR_HANDOVER:
428                         /*
429                          * We found a remote master SM, and we are waiting for it
430                          * to handover the mastership to us. Need to start polling
431                          * on that SM, to make sure it is alive, if it isn't - then
432                          * we should move back to discovering, since something must
433                          * have happened to it.
434                          */
435                         __osm_sm_state_mgr_start_polling(sm);
436                         break;
437                 case OSM_SM_SIGNAL_DISCOVER:
438                         sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING;
439                         osm_report_sm_state(sm);
440                         break;
441                 default:
442                         __osm_sm_state_mgr_signal_error(sm, signal);
443                         status = IB_INVALID_PARAMETER;
444                         break;
445                 }
446                 break;
447
448         default:
449                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3208: "
450                         "Invalid state %s\n",
451                         osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
452
453         }
454
455         cl_spinlock_release(&sm->state_lock);
456
457         OSM_LOG_EXIT(sm->p_log);
458         return (status);
459 }
460
461 /**********************************************************************
462  **********************************************************************/
463 ib_api_status_t osm_sm_state_mgr_check_legality(osm_sm_t * sm,
464                                                 IN osm_sm_signal_t signal)
465 {
466         ib_api_status_t status = IB_SUCCESS;
467
468         CL_ASSERT(sm);
469
470         OSM_LOG_ENTER(sm->p_log);
471
472         /*
473          * The state lock prevents many race conditions from screwing
474          * up the state transition process.
475          */
476         cl_spinlock_acquire(&sm->state_lock);
477
478         OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
479                 "Received signal %s in state %s\n",
480                 osm_get_sm_mgr_signal_str(signal),
481                 osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
482
483         switch (sm->p_subn->sm_state) {
484         case IB_SMINFO_STATE_DISCOVERING:
485                 switch (signal) {
486                 case OSM_SM_SIGNAL_DISCOVERY_COMPLETED:
487                 case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED:
488                 case OSM_SM_SIGNAL_HANDOVER:
489                         status = IB_SUCCESS;
490                         break;
491                 default:
492                         __osm_sm_state_mgr_signal_error(sm, signal);
493                         status = IB_INVALID_PARAMETER;
494                         break;
495                 }
496                 break;
497
498         case IB_SMINFO_STATE_STANDBY:
499                 switch (signal) {
500                 case OSM_SM_SIGNAL_POLLING_TIMEOUT:
501                 case OSM_SM_SIGNAL_DISCOVER:
502                 case OSM_SM_SIGNAL_DISABLE:
503                 case OSM_SM_SIGNAL_HANDOVER:
504                 case OSM_SM_SIGNAL_ACKNOWLEDGE:
505                         status = IB_SUCCESS;
506                         break;
507                 default:
508                         __osm_sm_state_mgr_signal_error(sm, signal);
509                         status = IB_INVALID_PARAMETER;
510                         break;
511                 }
512                 break;
513
514         case IB_SMINFO_STATE_NOTACTIVE:
515                 switch (signal) {
516                 case OSM_SM_SIGNAL_STANDBY:
517                         status = IB_SUCCESS;
518                         break;
519                 default:
520                         __osm_sm_state_mgr_signal_error(sm, signal);
521                         status = IB_INVALID_PARAMETER;
522                         break;
523                 }
524                 break;
525
526         case IB_SMINFO_STATE_MASTER:
527                 switch (signal) {
528                 case OSM_SM_SIGNAL_HANDOVER:
529                 case OSM_SM_SIGNAL_HANDOVER_SENT:
530                         status = IB_SUCCESS;
531                         break;
532                 default:
533                         __osm_sm_state_mgr_signal_error(sm, signal);
534                         status = IB_INVALID_PARAMETER;
535                         break;
536                 }
537                 break;
538
539         default:
540                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3209: "
541                         "Invalid state %s\n",
542                         osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
543                 status = IB_INVALID_PARAMETER;
544
545         }
546
547         cl_spinlock_release(&sm->state_lock);
548
549         OSM_LOG_EXIT(sm->p_log);
550         return (status);
551 }