3 # Copyright (c) 2006 The Regents of the University of California.
4 # Copyright (c) 2006-2008 Voltaire, Inc. All rights reserved.
6 # Produced at Lawrence Livermore National Laboratory.
7 # Written by Ira Weiny <weiny2@llnl.gov>.
8 # Erez Strauss from Voltaire for help in the get_link_ends code.
10 # This software is available to you under a choice of one of two
11 # licenses. You may choose to be licensed under the terms of the GNU
12 # General Public License (GPL) Version 2, available from the file
13 # COPYING in the main directory of this source tree, or the
14 # OpenIB.org BSD license below:
16 # Redistribution and use in source and binary forms, with or
17 # without modification, are permitted provided that the following
20 # - Redistributions of source code must retain the above
21 # copyright notice, this list of conditions and the following
24 # - Redistributions in binary form must reproduce the above
25 # copyright notice, this list of conditions and the following
26 # disclaimer in the documentation and/or other materials
27 # provided with the distribution.
29 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
33 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
34 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
35 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 %IBswcountlimits::cur_counts = ();
42 %IBswcountlimits::new_counts = ();
43 @IBswcountlimits::suppress_errors = ();
44 $IBswcountlimits::link_ends = undef;
45 $IBswcountlimits::pause_time = 10;
46 $IBswcountlimits::cache_dir = "/var/cache/infiniband-diags";
48 # all the PerfMgt counters
49 @IBswcountlimits::counters = (
50 "SymbolErrors", "LinkRecovers",
51 "LinkDowned", "RcvErrors",
52 "RcvRemotePhysErrors", "RcvSwRelayErrors",
53 "XmtDiscards", "XmtConstraintErrors",
54 "RcvConstraintErrors", "LinkIntegrityErrors",
55 "ExcBufOverrunErrors", "VL15Dropped",
60 # non-critical counters
61 %IBswcountlimits::error_counters = (
63 "No action is required except if counter is increasing along with LinkRecovers",
65 "If this is increasing along with SymbolErrors this may indicate a bad link, run ibswportwatch.pl on this port",
67 "Number of times the port has gone down (Usually for valid reasons)",
69 "This is a bad link, if the link is internal to a 288 try setting SDR, otherwise check the cable",
70 "RcvRemotePhysErrors",
71 "This indicates a problem ELSEWHERE in the fabric.",
73 "This is a symptom of congestion and may require tweaking either HOQ or switch lifetime values",
74 "XmtConstraintErrors",
75 "This is a result of bad partitioning, check partition configuration.",
76 "RcvConstraintErrors",
77 "This is a result of bad partitioning, check partition configuration.",
78 "LinkIntegrityErrors",
79 "May indicate a bad link, run ibswportwatch.pl on this port",
80 "ExcBufOverrunErrors",
81 "This is a flow control state machine error and can be caused by packets with physical errors",
83 "check with ibswportwatch.pl, if increasing in SMALL increments, OK",
85 "This counter can increase due to a valid network event"
90 my $print_action = $_[0];
93 COUNTER: foreach my $cnt (keys %IBswcountlimits::error_counters) {
94 if ($IBswcountlimits::cur_counts{$cnt} > 0) {
95 foreach my $sup_cnt (@IBswcountlimits::suppress_errors) {
96 if ("$cnt" eq $sup_cnt) { next COUNTER; }
98 print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]";
99 if ("$print_action" eq "yes") {
103 " $cnt: $IBswcountlimits::error_counters{$cnt}\n"
110 print "\n Actions:\n$actions";
115 %IBswcountlimits::data_counters = (
117 "Total number of data octets, divided by 4, transmitted on all VLs from the port",
119 "Total number of data octets, divided by 4, received on all VLs to the port",
121 "Total number of packets, excluding link packets, transmitted on all VLs from the port",
123 "Total number of packets, excluding link packets, received on all VLs to the port"
126 sub check_data_counters
128 my $print_action = $_[0];
131 COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) {
132 print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]";
133 if ("$print_action" eq "yes") {
137 " $cnt: $IBswcountlimits::data_counters{$cnt}\n"
142 print "\n Descriptions:\n$actions";
148 COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) {
149 my $cnt_per_second = calculate_rate(
150 $IBswcountlimits::cur_counts{$cnt},
151 $IBswcountlimits::new_counts{$cnt}
153 print " $cnt_per_second $cnt/second\n";
157 # =========================================================================
158 # Rate dependent counters
159 # calculate the count/sec
160 # calculate_rate old_count new_count
166 my $rate = ($new_val - $old_val) / $IBswcountlimits::pause_time;
169 %IBswcountlimits::rate_dep_thresholds = (
170 "SymbolErrors", 10, "LinkRecovers", 10,
171 "RcvErrors", 10, "LinkIntegrityErrors", 10,
175 sub check_counter_rates
177 foreach my $rate_count (keys %IBswcountlimits::rate_dep_thresholds) {
178 my $rate = calculate_rate(
179 $IBswcountlimits::cur_counts{$rate_count},
180 $IBswcountlimits::new_counts{$rate_count}
182 if ($rate > $IBswcountlimits::rate_dep_thresholds{$rate_count}) {
183 print "Detected excessive rate for $rate_count ($rate cnts/sec)\n";
184 } elsif ($rate > 0) {
185 print "Detected rate for $rate_count ($rate cnts/sec)\n";
190 # =========================================================================
195 foreach my $count (@IBswcountlimits::counters) {
196 $IBswcountlimits::cur_counts{$count} = 0;
200 # =========================================================================
206 foreach $count (keys %IBswcountlimits::critical) {
207 $total = $total + $IBswcountlimits::cur_counts{$count};
209 COUNTER: foreach $count (keys %IBswcountlimits::error_counters) {
210 foreach my $sup_cnt (@IBswcountlimits::suppress_errors) {
211 if ("$count" eq $sup_cnt) { next COUNTER; }
213 $total = $total + $IBswcountlimits::cur_counts{$count};
218 # =========================================================================
222 if (!(-d "$IBswcountlimits::cache_dir") &&
223 !mkdir($IBswcountlimits::cache_dir, 0700)) {
224 die "cannot create $IBswcountlimits::cache_dir: $!\n";
228 # =========================================================================
229 # get_cache_file(ca_name, ca_port)
237 "$IBswcountlimits::cache_dir/ibnetdiscover-$ca_name-$ca_port.topology");
240 # =========================================================================
241 # get_ca_name_port_param_string(ca_name, ca_port)
243 sub get_ca_name_port_param_string
248 if ("$ca_name" ne "") { $ca_name = "-C $ca_name"; }
249 if ("$ca_port" ne "") { $ca_port = "-P $ca_port"; }
251 return ("$ca_name $ca_port");
254 # =========================================================================
255 # generate_ibnetdiscover_topology(ca_name, ca_port)
257 sub generate_ibnetdiscover_topology
261 my $cache_file = get_cache_file($ca_name, $ca_port);
262 my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port);
264 if (`ibnetdiscover -g $extra_params > $cache_file`) {
265 die "Execution of ibnetdiscover failed: $!\n";
269 # =========================================================================
270 # get_link_ends(regenerate_map, ca_name, ca_port)
274 my $regenerate_map = $_[0];
278 my $cache_file = get_cache_file($ca_name, $ca_port);
280 if ($regenerate_map || !(-f "$cache_file")) {
281 generate_ibnetdiscover_topology($ca_name, $ca_port);
283 open IBNET_TOPO, "<$cache_file"
284 or die "Failed to open ibnet topology: $!\n";
285 my $in_switch = "no";
293 while ($line = <IBNET_TOPO>) {
294 if ($line =~ /^Switch.*\"S-(.*)\"\s+#.*\"(.*)\".* lid (\d+).*/) {
300 if ($in_switch eq "yes") {
303 /^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
309 my $rem_port_guid = $4;
313 loc_guid => "0x$guid",
314 loc_port => $loc_port,
317 loc_sw_lid => $loc_sw_lid,
318 rem_guid => "0x$rem_guid",
320 rem_port => $rem_port,
322 rem_desc => $rem_desc,
323 rem_port_guid => $rem_port_guid
327 /^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
331 my $loc_ext_port = $2;
334 my $rem_port_guid = $5;
338 loc_guid => "0x$guid",
339 loc_port => $loc_port,
340 loc_ext_port => $loc_ext_port,
342 loc_sw_lid => $loc_sw_lid,
343 rem_guid => "0x$rem_guid",
345 rem_port => $rem_port,
347 rem_desc => $rem_desc,
348 rem_port_guid => $rem_port_guid
352 /^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
358 my $rem_ext_port = $4;
359 my $rem_port_guid = $5;
363 loc_guid => "0x$guid",
364 loc_port => $loc_port,
367 loc_sw_lid => $loc_sw_lid,
368 rem_guid => "0x$rem_guid",
370 rem_port => $rem_port,
371 rem_ext_port => $rem_ext_port,
372 rem_desc => $rem_desc,
373 rem_port_guid => $rem_port_guid
377 /^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
381 my $loc_ext_port = $2;
384 my $rem_ext_port = $5;
385 my $rem_port_guid = $6;
389 loc_guid => "0x$guid",
390 loc_port => $loc_port,
391 loc_ext_port => $loc_ext_port,
393 loc_sw_lid => $loc_sw_lid,
394 rem_guid => "0x$rem_guid",
396 rem_port => $rem_port,
397 rem_ext_port => $rem_ext_port,
398 rem_desc => $rem_desc,
399 rem_port_guid => $rem_port_guid
403 $rec->{rem_port_guid} =~ s/\((.*)\)/$1/;
404 $IBswcountlimits::link_ends{"0x$guid"}{$loc_port} = $rec;
408 if ($line =~ /^Ca.*/ || $line =~ /^Rt.*/) { $in_switch = "no"; }
413 # =========================================================================
414 # get_num_ports(switch_guid, ca_name, ca_port)
422 my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port);
424 my $data = `smpquery $extra_params -G nodeinfo $guid` ||
425 die "'smpquery $extra_params -G nodeinfo $guid' faild\n";
426 my @lines = split("\n", $data);
427 my $pkt_lifetime = "";
428 foreach my $line (@lines) {
429 if ($line =~ /^NumPorts:\.+(.*)/) { $num_ports = $1; }
434 # =========================================================================
436 # The diags store the guids as strings. This converts the guid supplied
437 # to the correct string format.
438 # eg: 0x0008f10400411f56 == 0x8f10400411f56
445 $guid =~ tr/[A-F]/[a-f]/;
446 if ($guid =~ /0x(.*)/) {
447 $guid_str = sprintf("0x%016s", $1);
449 $guid_str = sprintf("0x%016s", $guid);
454 # =========================================================================
455 # convert_dr_to_guid(direct_route)
457 sub convert_dr_to_guid
461 my $data = `smpquery nodeinfo -D $_[0]` ||
462 die "'mpquery nodeinfo -D $_[0]' failed\n";
463 my @lines = split("\n", $data);
464 foreach my $line (@lines) {
465 if ($line =~ /^PortGuid:\.+(.*)/) { $guid = $1; }
467 return format_guid($guid);
470 # =========================================================================
471 # get_node_type(guid_or_direct_route)
476 my $query_arg = "smpquery nodeinfo ";
478 # assume arg is a guid if contains an x
479 $query_arg .= "-G " . $_[0];
481 # assume arg is a direct path
482 $query_arg .= "-D " . $_[0];
485 my $data = `$query_arg` ||
486 die "'$query_arg' failed\n";
487 my @lines = split("\n", $data);
488 foreach my $line (@lines) {
489 if ($line =~ /^NodeType:\.+(.*)/) { $type = $1; }
494 # =========================================================================
495 # is_switch(guid_or_direct_route)
499 my $node_type = &get_node_type($_[0]);
500 return ($node_type =~ /Switch/);