2 * Copyright (c) 2007 The Regents of the University of California.
3 * Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved.
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 * Implementation of osm_perfmgr_t.
38 * This object implements an IBA performance manager.
46 #endif /* HAVE_CONFIG_H */
48 #ifdef ENABLE_OSM_PERF_MGR
56 #include <netinet/in.h>
58 #include <arpa/inet.h>
59 #include <iba/ib_types.h>
60 #include <complib/cl_debug.h>
61 #include <complib/cl_thread.h>
62 #include <vendor/osm_vendor_api.h>
63 #include <opensm/osm_perfmgr.h>
64 #include <opensm/osm_log.h>
65 #include <opensm/osm_node.h>
66 #include <opensm/osm_opensm.h>
68 #define OSM_PERFMGR_INITIAL_TID_VALUE 0xcafe
70 #if ENABLE_OSM_PERF_MGR_PROFILE
76 } perfmgr_mad_stats = {
83 /* diff must be something which can fit in a susecond_t */
84 static inline void update_mad_stats(struct timeval *diff)
86 double new = (diff->tv_sec * 1000000) + diff->tv_usec;
87 if (new < perfmgr_mad_stats.fastest_us)
88 perfmgr_mad_stats.fastest_us = new;
89 if (new > perfmgr_mad_stats.slowest_us)
90 perfmgr_mad_stats.slowest_us = new;
92 perfmgr_mad_stats.avg_us =
93 ((perfmgr_mad_stats.avg_us * perfmgr_mad_stats.num) + new)
94 / (perfmgr_mad_stats.num + 1);
95 perfmgr_mad_stats.num++;
98 static inline void perfmgr_clear_mad_stats(void)
100 perfmgr_mad_stats.fastest_us = DBL_MAX;
101 perfmgr_mad_stats.slowest_us = DBL_MIN;
102 perfmgr_mad_stats.avg_us = 0;
103 perfmgr_mad_stats.num = 0;
/*
 * Store (*after - *before) in *diff.
 *
 * The result is normalised so that tv_usec is never negative when
 * after->tv_usec is smaller than before->tv_usec.  The difference is
 * built in a local first, so "after" and "diff" can be the same struct.
 */
static inline void diff_time(struct timeval *before,
			     struct timeval *after, struct timeval *diff)
{
	struct timeval delta;

	delta.tv_sec = after->tv_sec - before->tv_sec;
	delta.tv_usec = after->tv_usec - before->tv_usec;

	/* borrow one second when the microsecond part went negative */
	if (delta.tv_usec < 0) {
		delta.tv_sec -= 1;
		delta.tv_usec += 1000000;
	}

	*diff = delta;
}
121 extern int wait_for_pending_transactions(osm_stats_t * stats);
123 /**********************************************************************
124 * Internal helper functions.
125 **********************************************************************/
126 static inline void __init_monitored_nodes(osm_perfmgr_t * pm)
128 cl_qmap_init(&pm->monitored_map);
129 pm->remove_list = NULL;
130 cl_event_construct(&pm->sig_query);
131 cl_event_init(&pm->sig_query, FALSE);
135 __mark_for_removal(osm_perfmgr_t * pm, __monitored_node_t * node)
137 if (pm->remove_list) {
138 node->next = pm->remove_list;
139 pm->remove_list = node;
142 pm->remove_list = node;
146 static inline void __remove_marked_nodes(osm_perfmgr_t * pm)
148 while (pm->remove_list) {
149 __monitored_node_t *next = pm->remove_list->next;
151 cl_qmap_remove_item(&(pm->monitored_map),
152 (cl_map_item_t *) (pm->remove_list));
154 if (pm->remove_list->name)
155 free(pm->remove_list->name);
156 free(pm->remove_list);
157 pm->remove_list = next;
161 static inline void __decrement_outstanding_queries(osm_perfmgr_t * pm)
163 cl_atomic_dec(&(pm->outstanding_queries));
164 cl_event_signal(&(pm->sig_query));
167 /**********************************************************************
168 * Receive the MAD from the vendor layer and post it for processing by
170 **********************************************************************/
172 osm_perfmgr_mad_recv_callback(osm_madw_t * p_madw, void *bind_context,
173 osm_madw_t * p_req_madw)
175 osm_perfmgr_t *pm = (osm_perfmgr_t *) bind_context;
177 OSM_LOG_ENTER(pm->log);
179 osm_madw_copy_context(p_madw, p_req_madw);
180 osm_mad_pool_put(pm->mad_pool, p_req_madw);
182 __decrement_outstanding_queries(pm);
184 /* post this message for later processing. */
185 if (cl_disp_post(pm->pc_disp_h, OSM_MSG_MAD_PORT_COUNTERS,
186 (void *)p_madw, NULL, NULL) != CL_SUCCESS) {
187 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C01: "
188 "PerfMgr Dispatcher post failed\n");
189 osm_mad_pool_put(pm->mad_pool, p_madw);
191 OSM_LOG_EXIT(pm->log);
194 /**********************************************************************
195 * Process MAD send errors.
196 **********************************************************************/
198 osm_perfmgr_mad_send_err_callback(void *bind_context, osm_madw_t * p_madw)
200 osm_perfmgr_t *pm = (osm_perfmgr_t *) bind_context;
201 osm_madw_context_t *context = &(p_madw->context);
202 uint64_t node_guid = context->perfmgr_context.node_guid;
203 uint8_t port = context->perfmgr_context.port;
204 cl_map_item_t *p_node;
205 __monitored_node_t *p_mon_node;
207 OSM_LOG_ENTER(pm->log);
209 /* go ahead and get the monitored node struct to have the printable
210 * name if needed in messages
212 if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
213 cl_qmap_end(&(pm->monitored_map))) {
214 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C15: GUID 0x%016"
215 PRIx64 " not found in monitored map\n",
219 p_mon_node = (__monitored_node_t *) p_node;
221 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C02: %s (0x%" PRIx64
222 ") port %u\n", p_mon_node->name, p_mon_node->guid, port);
224 if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
225 /* First, find the node in the monitored map */
226 cl_plock_acquire(pm->lock);
227 /* Now, validate port number */
228 if (port > p_mon_node->redir_tbl_size) {
229 cl_plock_release(pm->lock);
230 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: "
231 "Invalid port num %u for %s (GUID 0x%016"
232 PRIx64 ") num ports %u\n", port, p_mon_node->name,
233 p_mon_node->guid, p_mon_node->redir_tbl_size);
236 /* Clear redirection info */
237 p_mon_node->redir_port[port].redir_lid = 0;
238 p_mon_node->redir_port[port].redir_qp = 0;
239 cl_plock_release(pm->lock);
243 osm_mad_pool_put(pm->mad_pool, p_madw);
245 __decrement_outstanding_queries(pm);
247 OSM_LOG_EXIT(pm->log);
250 /**********************************************************************
251 * Bind the PerfMgr to the vendor layer for MAD sends/receives
252 **********************************************************************/
/*
 * pm:        PerfMgr object; pm->bind_handle receives the new handle
 * port_guid: GUID of the port to bind to, stored in bind_info and
 *            byte-swapped with cl_ntoh64() only for logging
 *
 * A second bind is refused (ERR 4C03) when pm->bind_handle is already
 * valid.  Otherwise a bind_info for class IB_MCLASS_PERF, class version
 * 1, is filled in and handed to osm_vendor_bind() together with the
 * receive and send-error callbacks defined in this file.  ERR 4C04 is
 * logged when the vendor layer returns an invalid handle.
 *
 * NOTE(review): this listing does not show the return type, some of the
 * osm_vendor_bind() arguments, or the error exits - confirm them against
 * the full file before changing this function.
 */
254 osm_perfmgr_bind(osm_perfmgr_t * const pm, const ib_net64_t port_guid)
256 osm_bind_info_t bind_info;
257 ib_api_status_t status = IB_SUCCESS;
259 OSM_LOG_ENTER(pm->log);
/* only one bind per PerfMgr is allowed */
261 if (pm->bind_handle != OSM_BIND_INVALID_HANDLE) {
262 OSM_LOG(pm->log, OSM_LOG_ERROR,
263 "ERR 4C03: Multiple binds not allowed\n");
/* describe the binding: performance class, not a responder and not a
 * report or trap processor, default PerfMgr queue sizes */
268 bind_info.port_guid = port_guid;
269 bind_info.mad_class = IB_MCLASS_PERF;
270 bind_info.class_version = 1;
271 bind_info.is_responder = FALSE;
272 bind_info.is_report_processor = FALSE;
273 bind_info.is_trap_processor = FALSE;
274 bind_info.recv_q_size = OSM_PM_DEFAULT_QP1_RCV_SIZE;
275 bind_info.send_q_size = OSM_PM_DEFAULT_QP1_SEND_SIZE;
277 OSM_LOG(pm->log, OSM_LOG_VERBOSE,
278 "Binding to port GUID 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
280 pm->bind_handle = osm_vendor_bind(pm->vendor,
283 osm_perfmgr_mad_recv_callback,
284 osm_perfmgr_mad_send_err_callback,
/* an invalid handle means the vendor layer rejected the bind */
287 if (pm->bind_handle == OSM_BIND_INVALID_HANDLE) {
289 OSM_LOG(pm->log, OSM_LOG_ERROR,
290 "ERR 4C04: Vendor specific bind failed (%s)\n",
291 ib_get_err_str(status));
296 OSM_LOG_EXIT(pm->log);
300 /**********************************************************************
301 * Unbind the PerfMgr from the vendor layer for MAD sends/receives
302 **********************************************************************/
303 static void osm_perfmgr_mad_unbind(osm_perfmgr_t * const pm)
305 OSM_LOG_ENTER(pm->log);
306 if (pm->bind_handle == OSM_BIND_INVALID_HANDLE) {
307 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C05: No previous bind\n");
310 osm_vendor_unbind(pm->bind_handle);
312 OSM_LOG_EXIT(pm->log);
315 /**********************************************************************
316 * Given a monitored node and a port, return the qp
317 **********************************************************************/
318 static ib_net32_t get_qp(__monitored_node_t * mon_node, uint8_t port)
320 ib_net32_t qp = cl_ntoh32(1);
322 if (mon_node && mon_node->redir_tbl_size &&
323 port < mon_node->redir_tbl_size &&
324 mon_node->redir_port[port].redir_lid &&
325 mon_node->redir_port[port].redir_qp)
326 qp = mon_node->redir_port[port].redir_qp;
331 /**********************************************************************
332 * Given a node, a port, and an optional monitored node,
333 * return the appropriate lid to query that port
334 **********************************************************************/
336 get_lid(osm_node_t * p_node, uint8_t port, __monitored_node_t * mon_node)
338 if (mon_node && mon_node->redir_tbl_size &&
339 port < mon_node->redir_tbl_size &&
340 mon_node->redir_port[port].redir_lid)
341 return mon_node->redir_port[port].redir_lid;
343 switch (p_node->node_info.node_type) {
344 case IB_NODE_TYPE_CA:
345 case IB_NODE_TYPE_ROUTER:
346 return osm_node_get_base_lid(p_node, port);
347 case IB_NODE_TYPE_SWITCH:
348 return osm_node_get_base_lid(p_node, 0);
354 /**********************************************************************
355 * Form and send the Port Counters MAD for a single port.
356 **********************************************************************/
/*
 * perfmgr:    PerfMgr object supplying the MAD pool, bind handle,
 *             transaction id counter and query throttling state
 * dest_lid:   destination LID, stored as given in the MAD address
 * dest_qp:    destination QP, stored as given in the MAD address
 * port:       value written to port_select
 * mad_method: value written to the MAD header method field
 * p_context:  context copied into the MAD wrapper before sending
 *
 * After a successful send the outstanding query counter is incremented
 * and, if it exceeds max_outstanding_queries, the sweep is marked
 * suspended while this thread waits on sig_query.
 *
 * NOTE(review): the counter is incremented after osm_vendor_send(),
 * while the receive and send-error callbacks decrement it - confirm a
 * fast response cannot be accounted before the increment.
 * NOTE(review): the allocation-failure return below appears to leave
 * without a matching OSM_LOG_EXIT - confirm against the full file.
 */
357 static ib_api_status_t
358 osm_perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr, ib_net16_t dest_lid,
359 ib_net32_t dest_qp, uint8_t port, uint8_t mad_method,
360 osm_madw_context_t * const p_context)
362 ib_api_status_t status = IB_SUCCESS;
363 ib_port_counters_t *port_counter = NULL;
364 ib_perfmgt_mad_t *pm_mad = NULL;
365 osm_madw_t *p_madw = NULL;
367 OSM_LOG_ENTER(perfmgr->log);
370 osm_mad_pool_get(perfmgr->mad_pool, perfmgr->bind_handle,
371 MAD_BLOCK_SIZE, NULL);
373 return (IB_INSUFFICIENT_MEMORY);
375 pm_mad = osm_madw_get_perfmgt_mad_ptr(p_madw);
/* common MAD header: performance class, caller-supplied method, fresh
 * transaction id, PortCounters attribute */
378 pm_mad->header.base_ver = 1;
379 pm_mad->header.mgmt_class = IB_MCLASS_PERF;
380 pm_mad->header.class_ver = 1;
381 pm_mad->header.method = mad_method;
382 pm_mad->header.status = 0;
383 pm_mad->header.class_spec = 0;
384 pm_mad->header.trans_id =
385 cl_hton64((uint64_t) cl_atomic_inc(&(perfmgr->trans_id)));
386 pm_mad->header.attr_id = IB_MAD_ATTR_PORT_CNTRS;
387 pm_mad->header.resv = 0;
388 pm_mad->header.attr_mod = 0;
/* payload: zeroed counters with the port and a 0xFFFF counter mask */
390 port_counter = (ib_port_counters_t *) & (pm_mad->data);
391 memset(port_counter, 0, sizeof(*port_counter));
392 port_counter->port_select = port;
393 port_counter->counter_select = 0xFFFF;
/* addressing: given LID/QP, well-known QP1 qkey, pkey index 0, SL 0 */
395 p_madw->mad_addr.dest_lid = dest_lid;
396 p_madw->mad_addr.addr_type.gsi.remote_qp = dest_qp;
397 p_madw->mad_addr.addr_type.gsi.remote_qkey =
398 cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY);
399 /* FIXME what about other partitions */
400 p_madw->mad_addr.addr_type.gsi.pkey_ix = 0;
401 p_madw->mad_addr.addr_type.gsi.service_level = 0;
402 p_madw->mad_addr.addr_type.gsi.global_route = FALSE;
403 p_madw->resp_expected = TRUE;
406 p_madw->context = *p_context;
408 status = osm_vendor_send(perfmgr->bind_handle, p_madw, TRUE);
410 if (status == IB_SUCCESS) {
411 /* pause this thread if we have too many outstanding requests */
412 cl_atomic_inc(&(perfmgr->outstanding_queries));
413 if (perfmgr->outstanding_queries >
414 perfmgr->max_outstanding_queries) {
415 perfmgr->sweep_state = PERFMGR_SWEEP_SUSPENDED;
416 cl_event_wait_on(&perfmgr->sig_query, EVENT_NO_TIMEOUT,
418 perfmgr->sweep_state = PERFMGR_SWEEP_ACTIVE;
422 OSM_LOG_EXIT(perfmgr->log);
426 /**********************************************************************
427 * sweep the node_guid_tbl and collect the node guids to be tracked
428 **********************************************************************/
429 static void __collect_guids(cl_map_item_t * const p_map_item, void *context)
431 osm_node_t *node = (osm_node_t *) p_map_item;
432 uint64_t node_guid = cl_ntoh64(node->node_info.node_guid);
433 osm_perfmgr_t *pm = (osm_perfmgr_t *) context;
434 __monitored_node_t *mon_node = NULL;
437 OSM_LOG_ENTER(pm->log);
439 if (cl_qmap_get(&(pm->monitored_map), node_guid)
440 == cl_qmap_end(&(pm->monitored_map))) {
441 /* if not already in our map add it */
442 size = node->node_info.num_ports;
443 mon_node = malloc(sizeof(*mon_node) + sizeof(redir_t) * size);
445 OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C06: "
446 "malloc failed: not handling node %s"
447 "(GUID 0x%" PRIx64 ")\n", node->print_desc, node_guid);
450 memset(mon_node, 0, sizeof(*mon_node) + sizeof(redir_t) * size);
451 mon_node->guid = node_guid;
452 mon_node->name = strdup(node->print_desc);
453 mon_node->redir_tbl_size = size + 1;
454 cl_qmap_insert(&(pm->monitored_map), node_guid,
455 (cl_map_item_t *) mon_node);
459 OSM_LOG_EXIT(pm->log);
462 /**********************************************************************
463 * query the Port Counters of all the nodes in the subnet.
464 **********************************************************************/
/*
 * p_map_item: one entry of pm->monitored_map (a __monitored_node_t)
 * context:    the osm_perfmgr_t running the sweep
 *
 * With pm->lock held, looks the node up in the subnet by GUID.  A node
 * that no longer exists is logged (ERR 4C07) and marked for removal.
 * Otherwise a database entry is ensured for the node and a PortCounters
 * GET is issued for each port from startport up to, but not including,
 * the node's physical port count.
 *
 * NOTE(review): several short lines (declarations, conditions, exits)
 * are not present in this listing - confirm against the full file.
 * NOTE(review): osm_perfmgr_send_pc_mad() can wait on sig_query, and
 * pm->lock is still held at that point - confirm this is intended.
 */
466 __osm_perfmgr_query_counters(cl_map_item_t * const p_map_item, void *context)
468 ib_api_status_t status = IB_SUCCESS;
469 uint8_t port = 0, startport = 1;
470 osm_perfmgr_t *pm = (osm_perfmgr_t *) context;
471 osm_node_t *node = NULL;
472 __monitored_node_t *mon_node = (__monitored_node_t *) p_map_item;
473 osm_madw_context_t mad_context;
474 uint8_t num_ports = 0;
475 uint64_t node_guid = 0;
476 ib_net32_t remote_qp;
478 OSM_LOG_ENTER(pm->log);
480 cl_plock_acquire(pm->lock);
/* monitored GUIDs are kept in host order; the subnet lookup takes
 * network order */
481 node = osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
483 OSM_LOG(pm->log, OSM_LOG_ERROR,
484 "ERR 4C07: Node \"%s\" (guid 0x%" PRIx64
485 ") no longer exists so removing from PerfMgr monitoring\n",
486 mon_node->name, mon_node->guid);
487 __mark_for_removal(pm, mon_node);
491 num_ports = osm_node_get_num_physp(node);
492 node_guid = cl_ntoh64(node->node_info.node_guid);
494 /* make sure we have a database object ready to store this information */
495 if (perfmgr_db_create_entry(pm->db, node_guid, num_ports,
497 PERFMGR_EVENT_DB_SUCCESS) {
498 OSM_LOG(pm->log, OSM_LOG_ERROR,
499 "ERR 4C08: DB create entry failed for 0x%"
500 PRIx64 " (%s) : %s\n", node_guid, node->print_desc,
505 /* if switch, check for enhanced port 0 */
506 if (osm_node_get_type(node) == IB_NODE_TYPE_SWITCH &&
508 ib_switch_info_is_enhanced_port0(&node->sw->switch_info))
511 /* issue the query for each port */
512 for (port = startport; port < num_ports; port++) {
/* ports without a physical port object are not queried */
515 if (!osm_node_get_physp_ptr(node, port))
518 lid = get_lid(node, port, mon_node);
520 OSM_LOG(pm->log, OSM_LOG_DEBUG, "WARN: node 0x%" PRIx64
521 " port %d (%s): port out of range, skipping\n",
522 cl_ntoh64(node->node_info.node_guid), port,
527 remote_qp = get_qp(mon_node, port);
/* the context travels with the MAD and identifies node and port when
 * the response or send error comes back */
529 mad_context.perfmgr_context.node_guid = node_guid;
530 mad_context.perfmgr_context.port = port;
531 mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_GET;
532 #if ENABLE_OSM_PERF_MGR_PROFILE
533 gettimeofday(&(mad_context.perfmgr_context.query_start), NULL);
535 OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Getting stats for node 0x%"
536 PRIx64 " port %d (lid %u) (%s)\n", node_guid, port,
537 cl_ntoh16(lid), node->print_desc);
539 osm_perfmgr_send_pc_mad(pm, lid, remote_qp, port,
540 IB_MAD_METHOD_GET, &mad_context);
/* NOTE(review): this message passes node->node_info.node_guid without
 * cl_ntoh64(), unlike the other messages in this function, which use
 * the converted node_guid - confirm which byte order is wanted */
541 if (status != IB_SUCCESS)
542 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C09: "
543 "Failed to issue port counter query for node 0x%"
544 PRIx64 " port %d (%s)\n",
545 node->node_info.node_guid, port,
549 cl_plock_release(pm->lock);
550 OSM_LOG_EXIT(pm->log);
553 /**********************************************************************
555 * Basically this code should not be here, but merged with main OpenSM
556 **********************************************************************/
557 extern void osm_drop_mgr_process(IN osm_sm_t *sm);
/*
 * sweep_hop_1: send NodeInfo GET requests one directed-route hop away
 * from the SM port.
 *
 * The SM port object is looked up by sm->p_subn->sm_port_guid (ERR 4C81
 * if it is missing), and the bind handle is taken from the directed
 * route path of the node's local port.  For a CA or router a single
 * request goes out through the local port; for a switch one request is
 * sent per port whose physical port exists and whose state is above
 * IB_LINK_DOWN.  Any other node type is logged as ERR 4C83.
 *
 * NOTE(review): several declarations, the return statements and part of
 * a comment are not present in this listing - confirm the return values
 * against the full file.
 */
559 static int sweep_hop_1(osm_sm_t * sm)
561 ib_api_status_t status = IB_SUCCESS;
562 osm_bind_handle_t h_bind;
563 osm_madw_context_t context;
566 osm_physp_t *p_physp;
567 osm_dr_path_t *p_dr_path;
568 osm_dr_path_t hop_1_path;
569 ib_net64_t port_guid;
571 uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX];
573 osm_physp_t *p_ext_physp;
575 port_guid = sm->p_subn->sm_port_guid;
577 p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
579 OSM_LOG(sm->p_log, OSM_LOG_ERROR,
580 "ERR 4C81: No SM port object\n");
584 p_node = p_port->p_node;
585 port_num = ib_node_info_get_local_port_num(&p_node->node_info);
587 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
588 "Probing hop 1 on local port %u\n", port_num);
590 p_physp = osm_node_get_physp_ptr(p_node, port_num);
594 p_dr_path = osm_physp_get_dr_path_ptr(p_physp);
595 h_bind = osm_dr_path_get_bind_handle(p_dr_path);
597 CL_ASSERT(h_bind != OSM_BIND_INVALID_HANDLE);
599 memset(path_array, 0, sizeof(path_array));
600 /* the hop_1 operations depend on the type of our node.
601 * Currently - legal nodes that can host SM are SW and CA */
602 switch (osm_node_get_type(p_node)) {
603 case IB_NODE_TYPE_CA:
604 case IB_NODE_TYPE_ROUTER:
605 memset(&context, 0, sizeof(context));
606 context.ni_context.node_guid = osm_node_get_node_guid(p_node);
607 context.ni_context.port_num = port_num;
609 path_array[1] = port_num;
611 osm_dr_path_init(&hop_1_path, h_bind, 1, path_array);
612 status = osm_req_get(sm, &hop_1_path,
613 IB_MAD_ATTR_NODE_INFO, 0,
614 CL_DISP_MSGID_NONE, &context);
616 if (status != IB_SUCCESS)
617 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 4C82: "
618 "Request for NodeInfo failed\n");
621 case IB_NODE_TYPE_SWITCH:
622 /* Need to go over all the ports of the switch, and send a node_info
623 * from them. This doesn't include the port 0 of the switch, which
625 * Note: We'll send another switchInfo on port 0, since if no ports
626 * are connected, we still want to get some response, and have the
629 num_ports = osm_node_get_num_physp(p_node);
630 for (port_num = 0; port_num < num_ports; port_num++) {
631 /* go through the port only if the port is not DOWN */
632 p_ext_physp = osm_node_get_physp_ptr(p_node, port_num);
633 if (!p_ext_physp || ib_port_info_get_port_state
634 (&p_ext_physp->port_info) <= IB_LINK_DOWN)
637 memset(&context, 0, sizeof(context));
638 context.ni_context.node_guid =
639 osm_node_get_node_guid(p_node);
640 context.ni_context.port_num = port_num;
642 path_array[1] = port_num;
644 osm_dr_path_init(&hop_1_path, h_bind, 1, path_array);
645 status = osm_req_get(sm, &hop_1_path,
646 IB_MAD_ATTR_NODE_INFO, 0,
647 CL_DISP_MSGID_NONE, &context);
649 if (status != IB_SUCCESS)
650 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 4C82: "
651 "Request for NodeInfo failed\n");
656 OSM_LOG(sm->p_log, OSM_LOG_ERROR,
657 "ERR 4C83: Unknown node type %d\n",
658 osm_node_get_type(p_node));
664 static unsigned is_sm_port_down(osm_sm_t * const sm)
666 ib_net64_t port_guid;
669 port_guid = sm->p_subn->sm_port_guid;
673 CL_PLOCK_ACQUIRE(sm->p_lock);
674 p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
676 CL_PLOCK_RELEASE(sm->p_lock);
677 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 4C85: "
678 "SM port with GUID:%016" PRIx64 " is unknown\n",
679 cl_ntoh64(port_guid));
682 CL_PLOCK_RELEASE(sm->p_lock);
684 return osm_physp_get_port_state(p_port->p_physp) == IB_LINK_DOWN;
/*
 * sweep_hop_0: send a NodeInfo GET over a zero-hop directed route.
 *
 * The bind handle comes from the SM's MAD controller; when no port is
 * bound this is only reported at debug level.  Otherwise a directed
 * route path with hop count 0 is initialised and the request is issued
 * with a NULL context; a failed request is logged as ERR 4C86.
 *
 * NOTE(review): the return statements are not present in this listing -
 * confirm the return values against the full file.
 */
687 static int sweep_hop_0(osm_sm_t * const sm)
689 ib_api_status_t status;
690 osm_dr_path_t dr_path;
691 osm_bind_handle_t h_bind;
692 uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX];
/* an all-zero path array is handed to osm_dr_path_init() below */
694 memset(path_array, 0, sizeof(path_array));
696 h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl);
697 if (h_bind == OSM_BIND_INVALID_HANDLE) {
698 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "No bound ports.\n");
702 osm_dr_path_init(&dr_path, h_bind, 0, path_array);
703 status = osm_req_get(sm, &dr_path, IB_MAD_ATTR_NODE_INFO, 0,
704 CL_DISP_MSGID_NONE, NULL);
706 if (status != IB_SUCCESS)
707 OSM_LOG(sm->p_log, OSM_LOG_ERROR,
708 "ERR 4C86: Request for NodeInfo failed\n");
713 static void reset_node_count(cl_map_item_t * const p_map_item, void *cxt)
715 osm_node_t *p_node = (osm_node_t *) p_map_item;
716 p_node->discovery_count = 0;
719 static void reset_port_count(cl_map_item_t * const p_map_item, void *cxt)
721 osm_port_t *p_port = (osm_port_t *) p_map_item;
722 p_port->discovery_count = 0;
725 static void reset_switch_count(cl_map_item_t * const p_map_item, void *cxt)
727 osm_switch_t *p_sw = (osm_switch_t *) p_map_item;
728 p_sw->discovery_count = 0;
729 p_sw->need_update = 0;
/*
 * perfmgr_discovery: run a subnet discovery pass on behalf of the
 * PerfMgr.
 *
 * Visible steps: reset the discovery counts of all nodes, ports and
 * switches under the global lock, sweep hop 0, wait for pending
 * transactions, check whether the SM port is down, sweep hop 1, wait
 * again, and finally run the drop manager.
 *
 * NOTE(review): the declaration of ret, the checks on ret and all exits
 * are not present in this listing - confirm the control flow and return
 * values against the full file.
 */
732 static int perfmgr_discovery(osm_opensm_t * osm)
736 CL_PLOCK_ACQUIRE(&osm->lock);
737 cl_qmap_apply_func(&osm->subn.node_guid_tbl, reset_node_count, NULL);
738 cl_qmap_apply_func(&osm->subn.port_guid_tbl, reset_port_count, NULL);
739 cl_qmap_apply_func(&osm->subn.sw_guid_tbl, reset_switch_count, NULL);
740 CL_PLOCK_RELEASE(&osm->lock);
/* in_sweep_hop_0 is set for the duration of the hop 0 sweep and
 * cleared again before hop 1 */
742 osm->subn.in_sweep_hop_0 = TRUE;
744 ret = sweep_hop_0(&osm->sm);
748 if (wait_for_pending_transactions(&osm->stats))
751 if (is_sm_port_down(&osm->sm)) {
752 OSM_LOG(&osm->log, OSM_LOG_VERBOSE, "SM port is down\n");
756 osm->subn.in_sweep_hop_0 = FALSE;
758 ret = sweep_hop_1(&osm->sm);
762 if (wait_for_pending_transactions(&osm->stats))
766 osm_drop_mgr_process(&osm->sm);
772 /**********************************************************************
773 * Main PerfMgr processor - query the performance counters.
774 **********************************************************************/
/*
 * One PerfMgr sweep.  Nothing visible is done unless pm->state is
 * PERFMGR_STATE_ENABLED.  When the SM is in standby or not-active state
 * the PerfMgr runs its own discovery first.  The sweep then collects
 * node GUIDs into monitored_map under pm->lock, queries the counters of
 * every monitored node, removes nodes that were marked for removal, and
 * leaves sweep_state at PERFMGR_SWEEP_SLEEP.  With
 * ENABLE_OSM_PERF_MGR_PROFILE set, the sweep is timed and the MAD
 * statistics are logged and cleared.
 *
 * NOTE(review): short lines (exits, #endif lines, comment terminators)
 * are not present in this listing - confirm against the full file.
 */
775 void osm_perfmgr_process(osm_perfmgr_t * pm)
777 #if ENABLE_OSM_PERF_MGR_PROFILE
778 struct timeval before, after;
781 if (pm->state != PERFMGR_STATE_ENABLED)
784 if (pm->subn->sm_state == IB_SMINFO_STATE_STANDBY ||
785 pm->subn->sm_state == IB_SMINFO_STATE_NOTACTIVE)
786 perfmgr_discovery(pm->subn->p_osm);
788 #if ENABLE_OSM_PERF_MGR_PROFILE
789 gettimeofday(&before, NULL);
791 pm->sweep_state = PERFMGR_SWEEP_ACTIVE;
792 /* With the global lock held collect the node guids */
793 /* FIXME we should be able to track SA notices
794 * and not have to sweep the node_guid_tbl each pass
796 OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Gathering PerfMgr stats\n");
797 cl_plock_acquire(pm->lock);
798 cl_qmap_apply_func(&(pm->subn->node_guid_tbl),
799 __collect_guids, (void *)pm);
800 cl_plock_release(pm->lock);
802 /* then for each node query their counters */
803 cl_qmap_apply_func(&(pm->monitored_map),
804 __osm_perfmgr_query_counters, (void *)pm);
806 /* Clean out any nodes found to be removed during the
809 __remove_marked_nodes(pm);
811 #if ENABLE_OSM_PERF_MGR_PROFILE
812 /* spin on outstanding queries */
/* NOTE(review): this waits on pm->sig_sweep, while the helper that
 * decrements outstanding_queries signals pm->sig_query - confirm the
 * intended event; as written the loop re-checks every 1000 units of
 * the wait timeout */
813 while (pm->outstanding_queries > 0)
814 cl_event_wait_on(&pm->sig_sweep, 1000, TRUE);
816 gettimeofday(&after, NULL);
/* "after" is reused to hold the elapsed time */
817 diff_time(&before, &after, &after);
818 osm_log(pm->log, OSM_LOG_INFO,
819 "PerfMgr total sweep time : %ld.%06ld s\n"
820 " fastest mad : %g us\n"
821 " slowest mad : %g us\n"
822 " average mad : %g us\n",
823 after.tv_sec, after.tv_usec,
824 perfmgr_mad_stats.fastest_us,
825 perfmgr_mad_stats.slowest_us, perfmgr_mad_stats.avg_us);
826 perfmgr_clear_mad_stats();
829 pm->sweep_state = PERFMGR_SWEEP_SLEEP;
832 /**********************************************************************
833 * PerfMgr timer - loop continuously and signal SM to run PerfMgr
835 **********************************************************************/
836 static void perfmgr_sweep(void *arg)
838 osm_perfmgr_t *pm = arg;
840 if (pm->state == PERFMGR_STATE_ENABLED)
841 osm_sm_signal(pm->sm, OSM_SIGNAL_PERFMGR_SWEEP);
842 cl_timer_start(&pm->sweep_timer, pm->sweep_time_s * 1000);
845 /**********************************************************************
846 **********************************************************************/
847 void osm_perfmgr_shutdown(osm_perfmgr_t * const pm)
849 OSM_LOG_ENTER(pm->log);
850 cl_timer_stop(&pm->sweep_timer);
851 osm_perfmgr_mad_unbind(pm);
852 OSM_LOG_EXIT(pm->log);
855 /**********************************************************************
856 **********************************************************************/
857 void osm_perfmgr_destroy(osm_perfmgr_t * const pm)
859 OSM_LOG_ENTER(pm->log);
860 perfmgr_db_destroy(pm->db);
861 cl_timer_destroy(&pm->sweep_timer);
862 OSM_LOG_EXIT(pm->log);
865 /**********************************************************************
866 * Detect if someone else on the network could have cleared the counters
867 * without us knowing. This is easy to detect because the counters never wrap
870 * The one time this will not work is if the port is getting errors fast enough
871 * to have the reading overtake the previous reading. In this case counters
873 **********************************************************************/
/*
 * pm:       PerfMgr object supplying the database and the log
 * mon_node: monitored node the readings belong to
 * port:     port the readings belong to
 * cr:       current error counter reading
 * dc:       current data counter reading
 *
 * Each current reading is compared with the previous one stored in the
 * database.  If any counter is lower than before, an out of band clear
 * is logged (ERR 4C0A for error counters, ERR 4C0B for data counters)
 * and the stored previous reading is cleared.  A missing previous
 * reading is only reported at verbose level.
 *
 * NOTE(review): the return type and the exits taken after a failed
 * lookup are not present in this listing - confirm whether the data
 * counter check still runs when no previous error reading exists.
 */
875 osm_perfmgr_check_oob_clear(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
876 uint8_t port, perfmgr_db_err_reading_t * cr,
877 perfmgr_db_data_cnt_reading_t * dc)
879 perfmgr_db_err_reading_t prev_err;
880 perfmgr_db_data_cnt_reading_t prev_dc;
882 if (perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_err)
883 != PERFMGR_EVENT_DB_SUCCESS) {
884 OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Failed to find previous "
885 "error reading for %s (guid 0x%" PRIx64 ") port %u\n",
886 mon_node->name, mon_node->guid, port);
/* any error counter going backwards means somebody else cleared it */
890 if (cr->symbol_err_cnt < prev_err.symbol_err_cnt ||
891 cr->link_err_recover < prev_err.link_err_recover ||
892 cr->link_downed < prev_err.link_downed ||
893 cr->rcv_err < prev_err.rcv_err ||
894 cr->rcv_rem_phys_err < prev_err.rcv_rem_phys_err ||
895 cr->rcv_switch_relay_err < prev_err.rcv_switch_relay_err ||
896 cr->xmit_discards < prev_err.xmit_discards ||
897 cr->xmit_constraint_err < prev_err.xmit_constraint_err ||
898 cr->rcv_constraint_err < prev_err.rcv_constraint_err ||
899 cr->link_integrity < prev_err.link_integrity ||
900 cr->buffer_overrun < prev_err.buffer_overrun ||
901 cr->vl15_dropped < prev_err.vl15_dropped) {
902 OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C0A: "
903 "Detected an out of band error clear "
904 "on %s (0x%" PRIx64 ") port %u\n",
905 mon_node->name, mon_node->guid, port);
906 perfmgr_db_clear_prev_err(pm->db, mon_node->guid, port);
909 /* FIXME handle extended counters */
910 if (perfmgr_db_get_prev_dc(pm->db, mon_node->guid, port, &prev_dc)
911 != PERFMGR_EVENT_DB_SUCCESS) {
912 OSM_LOG(pm->log, OSM_LOG_VERBOSE,
913 "Failed to find previous data count "
914 "reading for %s (0x%" PRIx64 ") port %u\n",
915 mon_node->name, mon_node->guid, port);
/* same test for the data counters */
919 if (dc->xmit_data < prev_dc.xmit_data ||
920 dc->rcv_data < prev_dc.rcv_data ||
921 dc->xmit_pkts < prev_dc.xmit_pkts ||
922 dc->rcv_pkts < prev_dc.rcv_pkts) {
923 OSM_LOG(pm->log, OSM_LOG_ERROR,
924 "PerfMgr: ERR 4C0B: Detected an out of band data counter "
925 "clear on node %s (0x%" PRIx64 ") port %u\n",
926 mon_node->name, mon_node->guid, port);
927 perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
931 /**********************************************************************
932 * Return 1 if the value is "close" to overflowing
933 **********************************************************************/
/*
 * Overflow test for the 4-bit counters; the caller passes the values
 * extracted by PC_LINK_INT() and PC_BUF_OVERRUN().
 * NOTE(review): the body of this function is not present in this
 * listing, so its threshold cannot be stated here - confirm against
 * the full file.
 */
934 static int counter_overflow_4(uint8_t val)
/*
 * Return 1 if an 8-bit counter is "close" to overflowing, i.e. it has
 * reached UINT8_MAX minus a quarter of UINT8_MAX; 0 otherwise.
 */
static int counter_overflow_8(uint8_t val)
{
	const int threshold = UINT8_MAX - (UINT8_MAX / 4);

	return val >= threshold;
}
944 static int counter_overflow_16(ib_net16_t val)
946 return (cl_ntoh16(val) >= (UINT16_MAX - (UINT16_MAX / 4)));
949 static int counter_overflow_32(ib_net32_t val)
951 return (cl_ntoh32(val) >= (UINT32_MAX - (UINT32_MAX / 4)));
954 /**********************************************************************
955 * Check if the port counters have overflowed and if so issue a clear
957 **********************************************************************/
/*
 * pm:       PerfMgr object
 * mon_node: monitored node the counters were read from
 * port:     port the counters were read from
 * pc:       the PortCounters payload as received
 *
 * When any counter is close to its maximum, a PortCounters SET is sent
 * to the port to clear the counters, and the previous data counter
 * reading in the database is cleared.  A zero LID (ERR 4C0C) or a
 * failed send (ERR 4C11) is logged.
 *
 * NOTE(review): several short lines (return type, the lid declaration,
 * the lid check, exits) are not present in this listing - confirm
 * against the full file.
 */
959 osm_perfmgr_check_overflow(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
960 uint8_t port, ib_port_counters_t * pc)
962 osm_madw_context_t mad_context;
963 ib_api_status_t status;
964 ib_net32_t remote_qp;
966 OSM_LOG_ENTER(pm->log);
/* each counter is tested with the helper matching its width */
968 if (counter_overflow_16(pc->symbol_err_cnt) ||
969 counter_overflow_8(pc->link_err_recover) ||
970 counter_overflow_8(pc->link_downed) ||
971 counter_overflow_16(pc->rcv_err) ||
972 counter_overflow_16(pc->rcv_rem_phys_err) ||
973 counter_overflow_16(pc->rcv_switch_relay_err) ||
974 counter_overflow_16(pc->xmit_discards) ||
975 counter_overflow_8(pc->xmit_constraint_err) ||
976 counter_overflow_8(pc->rcv_constraint_err) ||
977 counter_overflow_4(PC_LINK_INT(pc->link_int_buffer_overrun)) ||
978 counter_overflow_4(PC_BUF_OVERRUN(pc->link_int_buffer_overrun)) ||
979 counter_overflow_16(pc->vl15_dropped) ||
980 counter_overflow_32(pc->xmit_data) ||
981 counter_overflow_32(pc->rcv_data) ||
982 counter_overflow_32(pc->xmit_pkts) ||
983 counter_overflow_32(pc->rcv_pkts)) {
984 osm_node_t *p_node = NULL;
987 osm_log(pm->log, OSM_LOG_VERBOSE,
988 "PerfMgr: Counter overflow: %s (0x%" PRIx64
989 ") port %d; clearing counters\n",
990 mon_node->name, mon_node->guid, port);
/* NOTE(review): the lookup result is handed to get_lid() unchecked -
 * confirm get_lid() tolerates a NULL node when the node has left the
 * subnet */
992 cl_plock_acquire(pm->lock);
993 p_node = osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
994 lid = get_lid(p_node, port, mon_node);
995 cl_plock_release(pm->lock);
997 OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C0C: "
998 "Failed to clear counters for %s (0x%"
999 PRIx64 ") port %d; failed to get lid\n",
1000 mon_node->name, mon_node->guid, port);
/* NOTE(review): get_qp() is given NULL here and therefore returns its
 * default QP, while get_lid() above was given mon_node and may return
 * a redirected LID - confirm the mismatch is intended */
1004 remote_qp = get_qp(NULL, port);
1006 mad_context.perfmgr_context.node_guid = mon_node->guid;
1007 mad_context.perfmgr_context.port = port;
1008 mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET;
1009 /* clear port counters */
1011 osm_perfmgr_send_pc_mad(pm, lid, remote_qp, port,
1012 IB_MAD_METHOD_SET, &mad_context);
1013 if (status != IB_SUCCESS)
1014 OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C11: "
1015 "Failed to send clear counters MAD for %s (0x%"
1016 PRIx64 ") port %d\n",
1017 mon_node->name, mon_node->guid, port);
/* only the previous data counter reading is cleared here */
1019 perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
1023 OSM_LOG_EXIT(pm->log);
1026 /**********************************************************************
1027 * Check values for logging of errors
1028 **********************************************************************/
1030 osm_perfmgr_log_events(osm_perfmgr_t * pm, __monitored_node_t *mon_node, uint8_t port,
1031 perfmgr_db_err_reading_t * reading)
1033 perfmgr_db_err_reading_t prev_read;
1034 time_t time_diff = 0;
1035 perfmgr_db_err_t err =
1036 perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_read);
1038 if (err != PERFMGR_EVENT_DB_SUCCESS) {
1039 OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Failed to find previous "
1040 "reading for %s (0x%" PRIx64 ") port %u\n",
1041 mon_node->name, mon_node->guid, port);
1044 time_diff = (reading->time - prev_read.time);
1046 /* FIXME these events should be defineable by the user in a config
1047 * file somewhere. */
1048 if (reading->symbol_err_cnt > prev_read.symbol_err_cnt)
1049 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C0D: "
1050 "Found %" PRIu64 " Symbol errors in %lu sec on %s (0x%"
1051 PRIx64 ") port %u\n",
1052 (reading->symbol_err_cnt - prev_read.symbol_err_cnt),
1053 time_diff, mon_node->name, mon_node->guid, port);
1055 if (reading->rcv_err > prev_read.rcv_err)
1056 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C0E: "
1058 " Receive errors in %lu sec on %s (0x%" PRIx64
1059 ") port %u\n", (reading->rcv_err - prev_read.rcv_err),
1060 time_diff, mon_node->name, mon_node->guid, port);
1062 if (reading->xmit_discards > prev_read.xmit_discards)
1063 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C0F: "
1064 "Found %" PRIu64 " Xmit Discards in %lu sec on %s (0x%"
1065 PRIx64 ") port %u\n",
1066 (reading->xmit_discards - prev_read.xmit_discards),
1067 time_diff, mon_node->name, mon_node->guid, port);
1070 /**********************************************************************
1071 * The dispatcher uses a thread pool which will call this function when
1072 * we have a thread available to process our mad received from the wire.
1073 **********************************************************************/
1074 static void osm_pc_rcv_process(void *context, void *data)
1076 osm_perfmgr_t *const pm = (osm_perfmgr_t *) context;
1077 osm_madw_t *p_madw = (osm_madw_t *) data;
1078 osm_madw_context_t *mad_context = &(p_madw->context);
1079 ib_port_counters_t *wire_read =
1080 (ib_port_counters_t *) & (osm_madw_get_perfmgt_mad_ptr(p_madw)->
1082 ib_mad_t *p_mad = osm_madw_get_mad_ptr(p_madw);
1083 uint64_t node_guid = mad_context->perfmgr_context.node_guid;
1084 uint8_t port = mad_context->perfmgr_context.port;
1085 perfmgr_db_err_reading_t err_reading;
1086 perfmgr_db_data_cnt_reading_t data_reading;
1087 cl_map_item_t *p_node;
1088 __monitored_node_t *p_mon_node;
1090 OSM_LOG_ENTER(pm->log);
1092 /* go ahead and get the monitored node struct to have the printable
1093 * name if needed in messages
1095 if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
1096 cl_qmap_end(&(pm->monitored_map))) {
1097 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C12: GUID 0x%016"
1098 PRIx64 " not found in monitored map\n",
1102 p_mon_node = (__monitored_node_t *) p_node;
1104 OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1105 "Processing received MAD status 0x%x context 0x%"
1106 PRIx64 " port %u\n", p_mad->status, node_guid, port);
1108 /* Response could also be redirection (IBM eHCA PMA does this) */
1109 if (p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) {
1110 char gid_str[INET6_ADDRSTRLEN];
1111 ib_class_port_info_t *cpi =
1112 (ib_class_port_info_t *) &
1113 (osm_madw_get_perfmgt_mad_ptr(p_madw)->data);
1114 ib_api_status_t status;
1116 OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1117 "Redirection to LID %u GID %s QP 0x%x received\n",
1118 cl_ntoh16(cpi->redir_lid),
1119 inet_ntop(AF_INET6, cpi->redir_gid.raw, gid_str,
1121 cl_ntoh32(cpi->redir_qp));
1123 /* LID or GID redirection ? */
1124 /* For GID redirection, need to get PathRecord from SA */
1125 if (cpi->redir_lid == 0) {
1126 OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1127 "GID redirection not currently implemented!\n");
1131 if (!pm->subn->opt.perfmgr_redir) {
1132 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: "
1133 "redirection requested but disabled\n");
1137 /* LID redirection support (easier than GID redirection) */
1138 cl_plock_acquire(pm->lock);
1139 /* Now, validate port number */
1140 if (port > p_mon_node->redir_tbl_size) {
1141 cl_plock_release(pm->lock);
1142 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C13: "
1143 "Invalid port num %d for GUID 0x%016"
1144 PRIx64 " num ports %d\n", port, node_guid,
1145 p_mon_node->redir_tbl_size);
1148 p_mon_node->redir_port[port].redir_lid = cpi->redir_lid;
1149 p_mon_node->redir_port[port].redir_qp = cpi->redir_qp;
1150 cl_plock_release(pm->lock);
1152 /* Finally, reissue the query to the redirected location */
1154 osm_perfmgr_send_pc_mad(pm, cpi->redir_lid, cpi->redir_qp,
1156 mad_context->perfmgr_context.
1157 mad_method, mad_context);
1158 if (status != IB_SUCCESS)
1159 OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C14: "
1160 "Failed to send redirected MAD with method 0x%x for node 0x%"
1161 PRIx64 " port %d\n",
1162 mad_context->perfmgr_context.mad_method,
1167 CL_ASSERT(p_mad->attr_id == IB_MAD_ATTR_PORT_CNTRS);
1169 perfmgr_db_fill_err_read(wire_read, &err_reading);
1170 /* FIXME separate query for extended counters if they are supported
1173 perfmgr_db_fill_data_cnt_read_pc(wire_read, &data_reading);
1175 /* detect an out of band clear on the port */
1176 if (mad_context->perfmgr_context.mad_method != IB_MAD_METHOD_SET)
1177 osm_perfmgr_check_oob_clear(pm, p_mon_node, port,
1178 &err_reading, &data_reading);
1180 /* log any critical events from this reading */
1181 osm_perfmgr_log_events(pm, p_mon_node, port, &err_reading);
1183 if (mad_context->perfmgr_context.mad_method == IB_MAD_METHOD_GET) {
1184 perfmgr_db_add_err_reading(pm->db, node_guid, port,
1186 perfmgr_db_add_dc_reading(pm->db, node_guid, port,
1189 perfmgr_db_clear_prev_err(pm->db, node_guid, port);
1190 perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
1193 osm_perfmgr_check_overflow(pm, p_mon_node, port, wire_read);
1195 #if ENABLE_OSM_PERF_MGR_PROFILE
1197 struct timeval proc_time;
1198 gettimeofday(&proc_time, NULL);
1199 diff_time(&(p_madw->context.perfmgr_context.query_start),
1200 &proc_time, &proc_time);
1201 update_mad_stats(&proc_time);
1206 osm_mad_pool_put(pm->mad_pool, p_madw);
1208 OSM_LOG_EXIT(pm->log);
1211 /**********************************************************************
1212 * Initialize the PerfMgr object
1213 **********************************************************************/
1215 osm_perfmgr_init(osm_perfmgr_t * const pm, osm_opensm_t *osm,
1216 const osm_subn_opt_t * const p_opt)
1218 ib_api_status_t status = IB_SUCCESS;
1220 OSM_LOG_ENTER(&osm->log);
1222 OSM_LOG(&osm->log, OSM_LOG_VERBOSE, "Initializing PerfMgr\n");
1224 memset(pm, 0, sizeof(*pm));
1226 cl_event_construct(&pm->sig_sweep);
1227 cl_event_init(&pm->sig_sweep, FALSE);
1228 pm->subn = &osm->subn;
1230 pm->log = &osm->log;
1231 pm->mad_pool = &osm->mad_pool;
1232 pm->vendor = osm->p_vendor;
1233 pm->trans_id = OSM_PERFMGR_INITIAL_TID_VALUE;
1234 pm->lock = &osm->lock;
1236 p_opt->perfmgr ? PERFMGR_STATE_ENABLED : PERFMGR_STATE_DISABLE;
1237 pm->sweep_time_s = p_opt->perfmgr_sweep_time_s;
1238 pm->max_outstanding_queries = p_opt->perfmgr_max_outstanding_queries;
1241 status = cl_timer_init(&pm->sweep_timer, perfmgr_sweep, pm);
1242 if (status != IB_SUCCESS)
1245 pm->db = perfmgr_db_construct(pm);
1247 pm->state = PERFMGR_STATE_NO_DB;
1251 pm->pc_disp_h = cl_disp_register(&osm->disp, OSM_MSG_MAD_PORT_COUNTERS,
1252 osm_pc_rcv_process, pm);
1253 if (pm->pc_disp_h == CL_DISP_INVALID_HANDLE)
1256 __init_monitored_nodes(pm);
1258 cl_timer_start(&pm->sweep_timer, pm->sweep_time_s * 1000);
1261 OSM_LOG_EXIT(pm->log);
1265 /**********************************************************************
1266 * Clear the counters from the db
1267 **********************************************************************/
1268 void osm_perfmgr_clear_counters(osm_perfmgr_t * pm)
1271 * FIXME todo issue clear on the fabric?
1273 perfmgr_db_clear_counters(pm->db);
1274 osm_log(pm->log, OSM_LOG_INFO, "PerfMgr counters cleared\n");
1277 /*******************************************************************
1278 * Have the DB dump its information to the file specified
1279 *******************************************************************/
1280 void osm_perfmgr_dump_counters(osm_perfmgr_t * pm, perfmgr_db_dump_t dump_type)
1284 if (pm->subn->opt.event_db_dump_file)
1285 file_name = pm->subn->opt.event_db_dump_file;
1287 snprintf(path, sizeof(path), "%s/%s",
1288 pm->subn->opt.dump_files_dir,
1289 OSM_PERFMGR_DEFAULT_DUMP_FILE);
1292 if (perfmgr_db_dump(pm->db, file_name, dump_type) != 0)
1293 OSM_LOG(pm->log, OSM_LOG_ERROR, "Failed to dump file %s : %s",
1294 file_name, strerror(errno));
1297 /*******************************************************************
1298 * Have the DB print its information to the fp specified
1299 *******************************************************************/
1301 osm_perfmgr_print_counters(osm_perfmgr_t *pm, char *nodename, FILE *fp)
1303 uint64_t guid = strtoull(nodename, NULL, 0);
1304 if (guid == 0 && errno == EINVAL) {
1305 perfmgr_db_print_by_name(pm->db, nodename, fp);
1307 perfmgr_db_print_by_guid(pm->db, guid, fp);
1311 #endif /* ENABLE_OSM_PERF_MGR */