]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/ofed/opensm/opensm/osm_node_info_rcv.c
MFV r348573: 9993 zil writes can get delayed in zio pipeline
[FreeBSD/FreeBSD.git] / contrib / ofed / opensm / opensm / osm_node_info_rcv.c
1 /*
2  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2009 HNR Consulting. All rights reserved.
6  *
7  * This software is available to you under a choice of one of two
8  * licenses.  You may choose to be licensed under the terms of the GNU
9  * General Public License (GPL) Version 2, available from the file
10  * COPYING in the main directory of this source tree, or the
11  * OpenIB.org BSD license below:
12  *
13  *     Redistribution and use in source and binary forms, with or
14  *     without modification, are permitted provided that the following
15  *     conditions are met:
16  *
17  *      - Redistributions of source code must retain the above
18  *        copyright notice, this list of conditions and the following
19  *        disclaimer.
20  *
21  *      - Redistributions in binary form must reproduce the above
22  *        copyright notice, this list of conditions and the following
23  *        disclaimer in the documentation and/or other materials
24  *        provided with the distribution.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33  * SOFTWARE.
34  *
35  */
36
37 /*
38  * Abstract:
39  *    Implementation of osm_ni_rcv_t.
40  * This object represents the NodeInfo Receiver object.
41  * This object is part of the opensm family of objects.
42  */
43
44 #if HAVE_CONFIG_H
45 #  include <config.h>
46 #endif                          /* HAVE_CONFIG_H */
47
48 #include <stdlib.h>
49 #include <string.h>
50 #include <iba/ib_types.h>
51 #include <complib/cl_qmap.h>
52 #include <complib/cl_passivelock.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_NODE_INFO_RCV_C
56 #include <opensm/osm_madw.h>
57 #include <opensm/osm_log.h>
58 #include <opensm/osm_node.h>
59 #include <opensm/osm_subnet.h>
60 #include <opensm/osm_router.h>
61 #include <opensm/osm_mad_pool.h>
62 #include <opensm/osm_helper.h>
63 #include <opensm/osm_msgdef.h>
64 #include <opensm/osm_opensm.h>
65 #include <opensm/osm_ucast_mgr.h>
66 #include <opensm/osm_db_pack.h>
67
68 static void report_duplicated_guid(IN osm_sm_t * sm, osm_physp_t * p_physp,
69                                    osm_node_t * p_neighbor_node,
70                                    const uint8_t port_num)
71 {
72         osm_physp_t *p_old, *p_new;
73         osm_dr_path_t path;
74
75         p_old = p_physp->p_remote_physp;
76         p_new = osm_node_get_physp_ptr(p_neighbor_node, port_num);
77
78         OSM_LOG(sm->p_log, OSM_LOG_SYS | OSM_LOG_ERROR, "ERR 0D01: "
79                 "Found duplicated node GUID.\n"
80                 "Node 0x%" PRIx64 " port %u is reachable from remote node "
81                 "0x%" PRIx64 " port %u and remote node 0x%" PRIx64 " port %u.\n"
82                 "Paths are:\n",
83                 cl_ntoh64(p_physp->p_node->node_info.node_guid),
84                 p_physp->port_num,
85                 p_old ? cl_ntoh64(p_old->p_node->node_info.node_guid) : 0,
86                 p_old ? p_old->port_num : 0,
87                 p_new ? cl_ntoh64(p_new->p_node->node_info.node_guid) : 0,
88                 p_new ? p_new->port_num : 0);
89
90         osm_dump_dr_path_v2(sm->p_log, osm_physp_get_dr_path_ptr(p_physp),
91                             FILE_ID, OSM_LOG_ERROR);
92
93         path = *osm_physp_get_dr_path_ptr(p_new);
94         if (osm_dr_path_extend(&path, port_num))
95                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D05: "
96                         "DR path with hop count %d couldn't be extended\n",
97                         path.hop_count);
98         osm_dump_dr_path_v2(sm->p_log, &path, FILE_ID, OSM_LOG_ERROR);
99 }
100
101 static void requery_dup_node_info(IN osm_sm_t * sm, osm_physp_t * p_physp,
102                                   unsigned count)
103 {
104         osm_madw_context_t context;
105         osm_dr_path_t path;
106         cl_status_t status;
107
108         if (!p_physp->p_remote_physp) {
109                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0D: "
110                         "DR path couldn't be extended due to NULL remote physp\n");
111                 return;
112         }
113
114         path = *osm_physp_get_dr_path_ptr(p_physp->p_remote_physp);
115         if (osm_dr_path_extend(&path, p_physp->p_remote_physp->port_num)) {
116                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D08: "
117                         "DR path with hop count %d couldn't be extended\n",
118                         path.hop_count);
119                 return;
120         }
121
122         context.ni_context.node_guid =
123             p_physp->p_remote_physp->p_node->node_info.port_guid;
124         context.ni_context.port_num = p_physp->p_remote_physp->port_num;
125         context.ni_context.dup_node_guid = p_physp->p_node->node_info.node_guid;
126         context.ni_context.dup_port_num = p_physp->port_num;
127         context.ni_context.dup_count = count;
128
129         status = osm_req_get(sm, &path, IB_MAD_ATTR_NODE_INFO, 0,
130                              TRUE, 0, CL_DISP_MSGID_NONE, &context);
131
132         if (status != IB_SUCCESS)
133                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D02: "
134                         "Failure initiating NodeInfo request (%s)\n",
135                         ib_get_err_str(status));
136 }
137
138 /**********************************************************************
139  The plock must be held before calling this function.
140 **********************************************************************/
141 static void ni_rcv_set_links(IN osm_sm_t * sm, osm_node_t * p_node,
142                              const uint8_t port_num,
143                              const osm_ni_context_t * p_ni_context)
144 {
145         osm_node_t *p_neighbor_node;
146         osm_physp_t *p_physp, *p_remote_physp;
147
148         OSM_LOG_ENTER(sm->p_log);
149
150         /*
151            A special case exists in which the node we're trying to
152            link is our own node.  In this case, the guid value in
153            the ni_context will be zero.
154          */
155         if (p_ni_context->node_guid == 0) {
156                 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
157                         "Nothing to link for our own node 0x%" PRIx64 "\n",
158                         cl_ntoh64(osm_node_get_node_guid(p_node)));
159                 goto _exit;
160         }
161
162         p_neighbor_node = osm_get_node_by_guid(sm->p_subn,
163                                                p_ni_context->node_guid);
164         if (PF(!p_neighbor_node)) {
165                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D10: "
166                         "Unexpected removal of neighbor node 0x%" PRIx64 "\n",
167                         cl_ntoh64(p_ni_context->node_guid));
168                 goto _exit;
169         }
170
171         /* When setting the link, ports on both
172            sides of the link should be initialized */
173         CL_ASSERT(osm_node_link_has_valid_ports(p_node, port_num,
174                                                 p_neighbor_node,
175                                                 p_ni_context->port_num));
176
177         if (osm_node_link_exists(p_node, port_num,
178                                  p_neighbor_node, p_ni_context->port_num)) {
179                 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Link already exists\n");
180                 goto _exit;
181         }
182
183         p_physp = osm_node_get_physp_ptr(p_node, port_num);
184         if (!p_physp) {
185                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD0E: "
186                         "Failed to find physp for port %d of Node GUID 0x%"
187                         PRIx64 "\n", port_num,
188                         cl_ntoh64(osm_node_get_node_guid(p_node)));
189                 goto _exit;
190         }
191
192         /*
193          * If the link went UP, after we already discovered it, we shouldn't
194          * set the link between the ports and resweep.
195          */
196         if (osm_physp_get_port_state(p_physp) == IB_LINK_DOWN &&
197             p_node->physp_discovered[port_num]) {
198                 /* Link down on another side. Don't create a link*/
199                 p_node->physp_discovered[port_num] = 0;
200                 sm->p_subn->force_heavy_sweep = TRUE;
201                 goto _exit;
202         }
203
204         if (osm_node_has_any_link(p_node, port_num) &&
205             sm->p_subn->force_heavy_sweep == FALSE &&
206             (!p_ni_context->dup_count ||
207              (p_ni_context->dup_node_guid == osm_node_get_node_guid(p_node) &&
208               p_ni_context->dup_port_num == port_num))) {
209                 /*
210                    Uh oh...
211                    This could be reconnected ports, but also duplicated GUID
212                    (2 nodes have the same guid) or a 12x link with lane reversal
213                    that is not configured correctly.
214                    We will try to recover by querying NodeInfo again.
215                    In order to catch even fast port moving to new location(s)
216                    and back we will count up to 5.
217                    Some crazy reconnections (newly created switch loop right
218                    before targeted CA) will not be catched this way. So in worst
219                    case - report GUID duplication and request new discovery.
220                    When switch node is targeted NodeInfo querying will be done
221                    in opposite order, this is much stronger check, unfortunately
222                    it is impossible with CAs.
223                  */
224                 p_physp = osm_node_get_physp_ptr(p_node, port_num);
225                 if (!p_physp) {
226                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD0F: "
227                                 "Failed to find physp for port %d of Node GUID 0x%"
228                                 PRIx64 "\n", port_num,
229                                 cl_ntoh64(osm_node_get_node_guid(p_node)));
230                         goto _exit;
231                 }
232
233                 if (p_ni_context->dup_count > 5) {
234                         report_duplicated_guid(sm, p_physp, p_neighbor_node,
235                                                p_ni_context->port_num);
236                         sm->p_subn->force_heavy_sweep = TRUE;
237                 } else if (p_node->sw)
238                         requery_dup_node_info(sm, p_physp->p_remote_physp,
239                                               p_ni_context->dup_count + 1);
240                 else
241                         requery_dup_node_info(sm, p_physp,
242                                               p_ni_context->dup_count + 1);
243         }
244
245         /*
246            When there are only two nodes with exact same guids (connected back
247            to back) - the previous check for duplicated guid will not catch
248            them. But the link will be from the port to itself...
249            Enhanced Port 0 is an exception to this
250          */
251         if (osm_node_get_node_guid(p_node) == p_ni_context->node_guid &&
252             port_num == p_ni_context->port_num &&
253             port_num != 0 && cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
254                 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
255                         "Duplicate GUID found by link from a port to itself:"
256                         "node 0x%" PRIx64 ", port number %u\n",
257                         cl_ntoh64(osm_node_get_node_guid(p_node)), port_num);
258                 p_physp = osm_node_get_physp_ptr(p_node, port_num);
259                 if (!p_physp) {
260                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD1D: "
261                                 "Failed to find physp for port %d of Node GUID 0x%"
262                                 PRIx64 "\n", port_num,
263                                 cl_ntoh64(osm_node_get_node_guid(p_node)));
264                         goto _exit;
265                 }
266
267                 osm_dump_dr_path_v2(sm->p_log, osm_physp_get_dr_path_ptr(p_physp),
268                                     FILE_ID, OSM_LOG_VERBOSE);
269
270                 if (sm->p_subn->opt.exit_on_fatal == TRUE) {
271                         osm_log_v2(sm->p_log, OSM_LOG_SYS, FILE_ID,
272                                    "Errors on subnet. Duplicate GUID found "
273                                    "by link from a port to itself. "
274                                    "See verbose opensm.log for more details\n");
275                         exit(1);
276                 }
277         }
278
279         OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
280                 "Creating new link between:\n\t\t\t\tnode 0x%" PRIx64
281                 ", port number %u and\n\t\t\t\tnode 0x%" PRIx64
282                 ", port number %u\n",
283                 cl_ntoh64(osm_node_get_node_guid(p_node)), port_num,
284                 cl_ntoh64(p_ni_context->node_guid), p_ni_context->port_num);
285
286         if (sm->ucast_mgr.cache_valid)
287                 osm_ucast_cache_check_new_link(&sm->ucast_mgr, p_node, port_num,
288                                                p_neighbor_node,
289                                                p_ni_context->port_num);
290
291         p_physp = osm_node_get_physp_ptr(p_node, port_num);
292         p_remote_physp = osm_node_get_physp_ptr(p_neighbor_node,
293                                                 p_ni_context->port_num);
294         if (!p_physp || !p_remote_physp)
295                 goto _exit;
296
297         osm_node_link(p_node, port_num, p_neighbor_node, p_ni_context->port_num);
298
299         osm_db_neighbor_set(sm->p_subn->p_neighbor,
300                             cl_ntoh64(osm_physp_get_port_guid(p_physp)),
301                             port_num,
302                             cl_ntoh64(osm_physp_get_port_guid(p_remote_physp)),
303                             p_ni_context->port_num);
304         osm_db_neighbor_set(sm->p_subn->p_neighbor,
305                             cl_ntoh64(osm_physp_get_port_guid(p_remote_physp)),
306                             p_ni_context->port_num,
307                             cl_ntoh64(osm_physp_get_port_guid(p_physp)),
308                             port_num);
309
310 _exit:
311         OSM_LOG_EXIT(sm->p_log);
312 }
313
314 static void ni_rcv_get_port_info(IN osm_sm_t * sm, IN osm_node_t * node,
315                                  IN const osm_madw_t * madw)
316 {
317         osm_madw_context_t context;
318         osm_physp_t *physp;
319         ib_node_info_t *ni;
320         unsigned port;
321         ib_api_status_t status;
322         int mlnx_epi_supported = 0;
323
324         ni = ib_smp_get_payload_ptr(osm_madw_get_smp_ptr(madw));
325
326         port = ib_node_info_get_local_port_num(ni);
327
328         if (sm->p_subn->opt.fdr10)
329                 mlnx_epi_supported = is_mlnx_ext_port_info_supported(
330                                                 ib_node_info_get_vendor_id(ni),
331                                                 ni->device_id);
332
333         physp = osm_node_get_physp_ptr(node, port);
334         if (!physp) {
335                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD1E: "
336                         "Failed to find physp for port %d of Node GUID 0x%"
337                         PRIx64 "\n", port,
338                         cl_ntoh64(osm_node_get_node_guid(node)));
339                 return;
340         }
341
342         context.pi_context.node_guid = osm_node_get_node_guid(node);
343         context.pi_context.port_guid = osm_physp_get_port_guid(physp);
344         context.pi_context.set_method = FALSE;
345         context.pi_context.light_sweep = FALSE;
346         context.pi_context.active_transition = FALSE;
347         context.pi_context.client_rereg = FALSE;
348
349         status = osm_req_get(sm, osm_physp_get_dr_path_ptr(physp),
350                              IB_MAD_ATTR_PORT_INFO, cl_hton32(port),
351                              TRUE, 0, CL_DISP_MSGID_NONE, &context);
352         if (status != IB_SUCCESS)
353                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD02: "
354                         "Failure initiating PortInfo request (%s)\n",
355                         ib_get_err_str(status));
356         if (mlnx_epi_supported) {
357                 status = osm_req_get(sm,
358                                      osm_physp_get_dr_path_ptr(physp),
359                                      IB_MAD_ATTR_MLNX_EXTENDED_PORT_INFO,
360                                      cl_hton32(port),
361                                      TRUE, 0, CL_DISP_MSGID_NONE, &context);
362                 if (status != IB_SUCCESS)
363                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0B: "
364                                 "Failure initiating MLNX ExtPortInfo request (%s)\n",
365                                 ib_get_err_str(status));
366         }
367 }
368
369 /**********************************************************************
370  The plock must be held before calling this function.
371 **********************************************************************/
372 void osm_req_get_node_desc(IN osm_sm_t * sm, osm_physp_t * p_physp)
373 {
374         ib_api_status_t status = IB_SUCCESS;
375         osm_madw_context_t context;
376
377         OSM_LOG_ENTER(sm->p_log);
378
379         context.nd_context.node_guid =
380             osm_node_get_node_guid(osm_physp_get_node_ptr(p_physp));
381
382         status = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_physp),
383                              IB_MAD_ATTR_NODE_DESC, 0, TRUE, 0,
384                              CL_DISP_MSGID_NONE, &context);
385         if (status != IB_SUCCESS)
386                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D03: "
387                         "Failure initiating NodeDescription request (%s)\n",
388                         ib_get_err_str(status));
389
390         OSM_LOG_EXIT(sm->p_log);
391 }
392
393 /**********************************************************************
394  The plock must be held before calling this function.
395 **********************************************************************/
396 static void ni_rcv_get_node_desc(IN osm_sm_t * sm, IN osm_node_t * p_node,
397                                  IN const osm_madw_t * p_madw)
398 {
399         ib_node_info_t *p_ni;
400         ib_smp_t *p_smp;
401         uint8_t port_num;
402         osm_physp_t *p_physp = NULL;
403
404         OSM_LOG_ENTER(sm->p_log);
405
406         p_smp = osm_madw_get_smp_ptr(p_madw);
407         p_ni = ib_smp_get_payload_ptr(p_smp);
408         port_num = ib_node_info_get_local_port_num(p_ni);
409
410         /*
411            Request PortInfo & NodeDescription attributes for the port
412            that responded to the NodeInfo attribute.
413            Because this is a channel adapter or router, we are
414            not allowed to request PortInfo for the other ports.
415            Set the context union properly, so the recipient
416            knows which node & port are relevant.
417          */
418         p_physp = osm_node_get_physp_ptr(p_node, port_num);
419         if (!p_physp) {
420                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD1F: "
421                         "Failed to find physp for port %d of Node GUID 0x%"
422                         PRIx64 "\n", port_num,
423                         cl_ntoh64(osm_node_get_node_guid(p_node)));
424                 return;
425         }
426
427         osm_req_get_node_desc(sm, p_physp);
428
429         OSM_LOG_EXIT(sm->p_log);
430 }
431
432 /**********************************************************************
433  The plock must be held before calling this function.
434 **********************************************************************/
435 static void ni_rcv_process_new_ca_or_router(IN osm_sm_t * sm,
436                                             IN osm_node_t * p_node,
437                                             IN const osm_madw_t * p_madw)
438 {
439         OSM_LOG_ENTER(sm->p_log);
440
441         ni_rcv_get_port_info(sm, p_node, p_madw);
442
443         /*
444            A node guid of 0 is the corner case that indicates
445            we discovered our own node.  Initialize the subnet
446            object with the SM's own port guid.
447          */
448         if (osm_madw_get_ni_context_ptr(p_madw)->node_guid == 0)
449                 sm->p_subn->sm_port_guid = p_node->node_info.port_guid;
450
451         OSM_LOG_EXIT(sm->p_log);
452 }
453
454 /**********************************************************************
455  The plock must be held before calling this function.
456 **********************************************************************/
457 static void ni_rcv_process_existing_ca_or_router(IN osm_sm_t * sm,
458                                                  IN osm_node_t * p_node,
459                                                  IN const osm_madw_t * p_madw)
460 {
461         ib_node_info_t *p_ni;
462         ib_smp_t *p_smp;
463         osm_port_t *p_port;
464         osm_port_t *p_port_check;
465         uint8_t port_num;
466         osm_dr_path_t *p_dr_path;
467         osm_alias_guid_t *p_alias_guid, *p_alias_guid_check;
468
469         OSM_LOG_ENTER(sm->p_log);
470
471         p_smp = osm_madw_get_smp_ptr(p_madw);
472         p_ni = ib_smp_get_payload_ptr(p_smp);
473         port_num = ib_node_info_get_local_port_num(p_ni);
474
475         /*
476            Determine if we have encountered this node through a
477            previously undiscovered port.  If so, build the new
478            port object.
479          */
480         p_port = osm_get_port_by_guid(sm->p_subn, p_ni->port_guid);
481         if (!p_port) {
482                 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
483                         "Creating new port object with GUID 0x%" PRIx64 "\n",
484                         cl_ntoh64(p_ni->port_guid));
485
486                 osm_node_init_physp(p_node, port_num, p_madw);
487
488                 p_port = osm_port_new(p_ni, p_node);
489                 if (PF(p_port == NULL)) {
490                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D04: "
491                                 "Unable to create new port object\n");
492                         goto Exit;
493                 }
494
495                 /*
496                    Add the new port object to the database.
497                  */
498                 p_port_check =
499                     (osm_port_t *) cl_qmap_insert(&sm->p_subn->port_guid_tbl,
500                                                   p_ni->port_guid,
501                                                   &p_port->map_item);
502                 if (PF(p_port_check != p_port)) {
503                         /*
504                            We should never be here!
505                            Somehow, this port GUID already exists in the table.
506                          */
507                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D12: "
508                                 "Port 0x%" PRIx64 " already in the database!\n",
509                                 cl_ntoh64(p_ni->port_guid));
510
511                         osm_port_delete(&p_port);
512                         goto Exit;
513                 }
514
515                 p_alias_guid = osm_alias_guid_new(p_ni->port_guid,
516                                                   p_port);
517                 if (PF(!p_alias_guid)) {
518                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D11: "
519                                 "alias guid memory allocation failed"
520                                 " for port GUID 0x%" PRIx64 "\n",
521                                 cl_ntoh64(p_ni->port_guid));
522                         goto alias_done;
523                 }
524
525                 /* insert into alias guid table */
526                 p_alias_guid_check =
527                         (osm_alias_guid_t *) cl_qmap_insert(&sm->p_subn->alias_port_guid_tbl,
528                                                             p_alias_guid->alias_guid,
529                                                             &p_alias_guid->map_item);
530                 if (p_alias_guid_check != p_alias_guid) {
531                         /* alias GUID is a duplicate */
532                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D13: "
533                                 "Duplicate alias port GUID 0x%" PRIx64 "\n",
534                                 cl_ntoh64(p_ni->port_guid));
535                         osm_alias_guid_delete(&p_alias_guid);
536                         osm_port_delete(&p_port);
537                         goto Exit;
538                 }
539
540 alias_done:
541                 /* If we are a master, then this means the port is new on the subnet.
542                    Mark it as new - need to send trap 64 for these ports.
543                    The condition that we are master is true, since if we are in discovering
544                    state (meaning we woke up from standby or we are just initializing),
545                    then these ports may be new to us, but are not new on the subnet.
546                    If we are master, then the subnet as we know it is the updated one,
547                    and any new ports we encounter should cause trap 64. C14-72.1.1 */
548                 if (sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER)
549                         p_port->is_new = 1;
550
551         } else {
552                 osm_physp_t *p_physp = osm_node_get_physp_ptr(p_node, port_num);
553
554                 if (PF(p_physp == NULL)) {
555                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1C: "
556                                 "No physical port found for node GUID 0x%"
557                                 PRIx64 " port %u. Might be duplicate port GUID\n",
558                                 cl_ntoh64(p_node->node_info.node_guid),
559                                 port_num);
560                         goto Exit;
561                 }
562
563                 /*
564                    Update the DR Path to the port,
565                    in case the old one is no longer available.
566                  */
567                 p_dr_path = osm_physp_get_dr_path_ptr(p_physp);
568
569                 osm_dr_path_init(p_dr_path, p_smp->hop_count,
570                                  p_smp->initial_path);
571         }
572
573         ni_rcv_get_port_info(sm, p_node, p_madw);
574
575 Exit:
576         OSM_LOG_EXIT(sm->p_log);
577 }
578
579 static void ni_rcv_process_switch(IN osm_sm_t * sm, IN osm_node_t * p_node,
580                                   IN const osm_madw_t * p_madw)
581 {
582         ib_api_status_t status = IB_SUCCESS;
583         osm_physp_t *p_physp;
584         osm_madw_context_t context;
585         osm_dr_path_t *path;
586         ib_smp_t *p_smp;
587
588         OSM_LOG_ENTER(sm->p_log);
589
590         p_smp = osm_madw_get_smp_ptr(p_madw);
591
592         p_physp = osm_node_get_physp_ptr(p_node, 0);
593         /* update DR path of already initialized switch port 0 */
594         path = osm_physp_get_dr_path_ptr(p_physp);
595         osm_dr_path_init(path, p_smp->hop_count, p_smp->initial_path);
596
597         context.si_context.node_guid = osm_node_get_node_guid(p_node);
598         context.si_context.set_method = FALSE;
599         context.si_context.light_sweep = FALSE;
600         context.si_context.lft_top_change = FALSE;
601
602         /* Request a SwitchInfo attribute */
603         status = osm_req_get(sm, path, IB_MAD_ATTR_SWITCH_INFO, 0, TRUE, 0,
604                              CL_DISP_MSGID_NONE, &context);
605         if (status != IB_SUCCESS)
606                 /* continue despite error */
607                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D06: "
608                         "Failure initiating SwitchInfo request (%s)\n",
609                         ib_get_err_str(status));
610
611         OSM_LOG_EXIT(sm->p_log);
612 }
613
614 /**********************************************************************
615  The plock must be held before calling this function.
616 **********************************************************************/
617 static void ni_rcv_process_existing_switch(IN osm_sm_t * sm,
618                                            IN osm_node_t * p_node,
619                                            IN const osm_madw_t * p_madw)
620 {
621         OSM_LOG_ENTER(sm->p_log);
622
623         /*
624            If this switch has already been probed during this sweep,
625            then don't bother reprobing it.
626          */
627         if (p_node->discovery_count == 1)
628                 ni_rcv_process_switch(sm, p_node, p_madw);
629
630         OSM_LOG_EXIT(sm->p_log);
631 }
632
633 /**********************************************************************
634  The plock must be held before calling this function.
635 **********************************************************************/
636 static void ni_rcv_process_new_switch(IN osm_sm_t * sm, IN osm_node_t * p_node,
637                                       IN const osm_madw_t * p_madw)
638 {
639         OSM_LOG_ENTER(sm->p_log);
640
641         ni_rcv_process_switch(sm, p_node, p_madw);
642
643         /*
644            A node guid of 0 is the corner case that indicates
645            we discovered our own node.  Initialize the subnet
646            object with the SM's own port guid.
647          */
648         if (osm_madw_get_ni_context_ptr(p_madw)->node_guid == 0)
649                 sm->p_subn->sm_port_guid = p_node->node_info.port_guid;
650
651         OSM_LOG_EXIT(sm->p_log);
652 }
653
654 /**********************************************************************
655  The plock must NOT be held before calling this function.
656 **********************************************************************/
657 static void ni_rcv_process_new(IN osm_sm_t * sm, IN const osm_madw_t * p_madw)
658 {
659         osm_node_t *p_node;
660         osm_node_t *p_node_check;
661         osm_port_t *p_port;
662         osm_port_t *p_port_check;
663         osm_router_t *p_rtr = NULL;
664         osm_router_t *p_rtr_check;
665         cl_qmap_t *p_rtr_guid_tbl;
666         ib_node_info_t *p_ni;
667         ib_smp_t *p_smp;
668         osm_ni_context_t *p_ni_context;
669         osm_alias_guid_t *p_alias_guid, *p_alias_guid_check;
670         uint8_t port_num;
671
672         OSM_LOG_ENTER(sm->p_log);
673
674         p_smp = osm_madw_get_smp_ptr(p_madw);
675         p_ni = ib_smp_get_payload_ptr(p_smp);
676         p_ni_context = osm_madw_get_ni_context_ptr(p_madw);
677         port_num = ib_node_info_get_local_port_num(p_ni);
678
679         osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_VERBOSE);
680
681         OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
682                 "Discovered new %s node,"
683                 "\n\t\t\t\tGUID 0x%" PRIx64 ", TID 0x%" PRIx64 "\n",
684                 ib_get_node_type_str(p_ni->node_type),
685                 cl_ntoh64(p_ni->node_guid), cl_ntoh64(p_smp->trans_id));
686
687         if (PF(port_num > p_ni->num_ports)) {
688                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0A: "
689                         "New %s node GUID 0x%" PRIx64 "is non-compliant and "
690                         "is being ignored since the "
691                         "local port num %u > num ports %u\n",
692                         ib_get_node_type_str(p_ni->node_type),
693                         cl_ntoh64(p_ni->node_guid), port_num,
694                         p_ni->num_ports);
695                 goto Exit;
696         }
697
698         p_node = osm_node_new(p_madw);
699         if (PF(p_node == NULL)) {
700                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D07: "
701                         "Unable to create new node object\n");
702                 goto Exit;
703         }
704
705         /*
706            Create a new port object to represent this node's physical
707            ports in the port table.
708          */
709         p_port = osm_port_new(p_ni, p_node);
710         if (PF(p_port == NULL)) {
711                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D14: "
712                         "Unable to create new port object\n");
713                 osm_node_delete(&p_node);
714                 goto Exit;
715         }
716
717         /*
718            Add the new port object to the database.
719          */
720         p_port_check =
721             (osm_port_t *) cl_qmap_insert(&sm->p_subn->port_guid_tbl,
722                                           p_ni->port_guid, &p_port->map_item);
723         if (PF(p_port_check != p_port)) {
724                 /*
725                    We should never be here!
726                    Somehow, this port GUID already exists in the table.
727                  */
728                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D15: "
729                         "Duplicate Port GUID 0x%" PRIx64
730                         "! Found by the two directed routes:\n",
731                         cl_ntoh64(p_ni->port_guid));
732                 osm_dump_dr_path_v2(sm->p_log,
733                                     osm_physp_get_dr_path_ptr(p_port->p_physp),
734                                     FILE_ID, OSM_LOG_ERROR);
735                 osm_dump_dr_path_v2(sm->p_log,
736                                     osm_physp_get_dr_path_ptr(p_port_check->
737                                                            p_physp),
738                                     FILE_ID, OSM_LOG_ERROR);
739                 osm_port_delete(&p_port);
740                 osm_node_delete(&p_node);
741                 goto Exit;
742         }
743
744         p_alias_guid = osm_alias_guid_new(p_ni->port_guid,
745                                           p_port);
746         if (PF(!p_alias_guid)) {
747                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D18: "
748                         "alias guid memory allocation failed"
749                         " for port GUID 0x%" PRIx64 "\n",
750                         cl_ntoh64(p_ni->port_guid));
751                 goto alias_done2;
752         }
753
754         /* insert into alias guid table */
755         p_alias_guid_check =
756                 (osm_alias_guid_t *) cl_qmap_insert(&sm->p_subn->alias_port_guid_tbl,
757                                                     p_alias_guid->alias_guid,
758                                                     &p_alias_guid->map_item);
759         if (p_alias_guid_check != p_alias_guid) {
760                 /* alias GUID is a duplicate */
761                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D19: "
762                         "Duplicate alias port GUID 0x%" PRIx64 "\n",
763                         cl_ntoh64(p_ni->port_guid));
764                 osm_alias_guid_delete(&p_alias_guid);
765         }
766
767 alias_done2:
768         /* If we are a master, then this means the port is new on the subnet.
769            Mark it as new - need to send trap 64 on these ports.
770            The condition that we are master is true, since if we are in discovering
771            state (meaning we woke up from standby or we are just initializing),
772            then these ports may be new to us, but are not new on the subnet.
773            If we are master, then the subnet as we know it is the updated one,
774            and any new ports we encounter should cause trap 64. C14-72.1.1 */
775         if (sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER)
776                 p_port->is_new = 1;
777
778         /* If there were RouterInfo or other router attribute,
779            this would be elsewhere */
780         if (p_ni->node_type == IB_NODE_TYPE_ROUTER) {
781                 if (PF((p_rtr = osm_router_new(p_port)) == NULL))
782                         OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1A: "
783                                 "Unable to create new router object\n");
784                 else {
785                         p_rtr_guid_tbl = &sm->p_subn->rtr_guid_tbl;
786                         p_rtr_check =
787                             (osm_router_t *) cl_qmap_insert(p_rtr_guid_tbl,
788                                                             p_ni->port_guid,
789                                                             &p_rtr->map_item);
790                         if (PF(p_rtr_check != p_rtr))
791                                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1B: "
792                                         "Unable to add port GUID:0x%016" PRIx64
793                                         " to router table\n",
794                                         cl_ntoh64(p_ni->port_guid));
795                 }
796         }
797
798         p_node_check =
799             (osm_node_t *) cl_qmap_insert(&sm->p_subn->node_guid_tbl,
800                                           p_ni->node_guid, &p_node->map_item);
801         if (PF(p_node_check != p_node)) {
802                 /*
803                    This node must have been inserted by another thread.
804                    This is unexpected, but is not an error.
805                    We can simply clean-up, since the other thread will
806                    see this processing through to completion.
807                  */
808                 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
809                         "Discovery race detected at node 0x%" PRIx64 "\n",
810                         cl_ntoh64(p_ni->node_guid));
811                 osm_node_delete(&p_node);
812                 p_node = p_node_check;
813                 ni_rcv_set_links(sm, p_node, port_num, p_ni_context);
814                 goto Exit;
815         } else
816                 ni_rcv_set_links(sm, p_node, port_num, p_ni_context);
817
818         p_node->discovery_count++;
819         ni_rcv_get_node_desc(sm, p_node, p_madw);
820
821         switch (p_ni->node_type) {
822         case IB_NODE_TYPE_CA:
823         case IB_NODE_TYPE_ROUTER:
824                 ni_rcv_process_new_ca_or_router(sm, p_node, p_madw);
825                 break;
826         case IB_NODE_TYPE_SWITCH:
827                 ni_rcv_process_new_switch(sm, p_node, p_madw);
828                 break;
829         default:
830                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D16: "
831                         "Unknown node type %u with GUID 0x%" PRIx64 "\n",
832                         p_ni->node_type, cl_ntoh64(p_ni->node_guid));
833                 break;
834         }
835
836 Exit:
837         OSM_LOG_EXIT(sm->p_log);
838 }
839
840 /**********************************************************************
841  The plock must be held before calling this function.
842 **********************************************************************/
843 static void ni_rcv_process_existing(IN osm_sm_t * sm, IN osm_node_t * p_node,
844                                     IN const osm_madw_t * p_madw)
845 {
846         ib_node_info_t *p_ni;
847         ib_smp_t *p_smp;
848         osm_ni_context_t *p_ni_context;
849         uint8_t port_num;
850
851         OSM_LOG_ENTER(sm->p_log);
852
853         p_smp = osm_madw_get_smp_ptr(p_madw);
854         p_ni = ib_smp_get_payload_ptr(p_smp);
855         p_ni_context = osm_madw_get_ni_context_ptr(p_madw);
856         port_num = ib_node_info_get_local_port_num(p_ni);
857
858         OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
859                 "Rediscovered %s node 0x%" PRIx64 " TID 0x%" PRIx64
860                 ", discovered %u times already\n",
861                 ib_get_node_type_str(p_ni->node_type),
862                 cl_ntoh64(p_ni->node_guid),
863                 cl_ntoh64(p_smp->trans_id), p_node->discovery_count);
864
865         if (PF(port_num > p_ni->num_ports)) {
866                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0C: "
867                         "Existing %s node GUID 0x%" PRIx64 "is non-compliant "
868                         "and is being ignored since the "
869                         "local port num %u > num ports %u\n",
870                         ib_get_node_type_str(p_ni->node_type),
871                         cl_ntoh64(p_ni->node_guid), port_num,
872                         p_ni->num_ports);
873                 goto Exit;
874         }
875
876         /*
877            If we haven't already encountered this existing node
878            on this particular sweep, then process further.
879          */
880         p_node->discovery_count++;
881
882         switch (p_ni->node_type) {
883         case IB_NODE_TYPE_CA:
884         case IB_NODE_TYPE_ROUTER:
885                 ni_rcv_process_existing_ca_or_router(sm, p_node, p_madw);
886                 break;
887
888         case IB_NODE_TYPE_SWITCH:
889                 ni_rcv_process_existing_switch(sm, p_node, p_madw);
890                 break;
891
892         default:
893                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D09: "
894                         "Unknown node type %u with GUID 0x%" PRIx64 "\n",
895                         p_ni->node_type, cl_ntoh64(p_ni->node_guid));
896                 break;
897         }
898
899         if ( p_ni->sys_guid != p_node->node_info.sys_guid) {
900                 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Updated SysImageGUID: 0x%"
901                         PRIx64 " for node 0x%" PRIx64 "\n",
902                         cl_ntoh64(p_ni->sys_guid),
903                         cl_ntoh64(p_ni->node_guid));
904         }
905         ni_rcv_set_links(sm, p_node, port_num, p_ni_context);
906         p_node->node_info = *p_ni;
907
908 Exit:
909         OSM_LOG_EXIT(sm->p_log);
910 }
911
912 void osm_ni_rcv_process(IN void *context, IN void *data)
913 {
914         osm_sm_t *sm = context;
915         osm_madw_t *p_madw = data;
916         ib_node_info_t *p_ni;
917         ib_smp_t *p_smp;
918         osm_node_t *p_node;
919
920         CL_ASSERT(sm);
921
922         OSM_LOG_ENTER(sm->p_log);
923
924         CL_ASSERT(p_madw);
925
926         p_smp = osm_madw_get_smp_ptr(p_madw);
927         p_ni = ib_smp_get_payload_ptr(p_smp);
928
929         CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_NODE_INFO);
930
931         if (PF(p_ni->node_guid == 0)) {
932                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D16: "
933                         "Got Zero Node GUID! Found on the directed route:\n");
934                 osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_ERROR);
935                 goto Exit;
936         }
937
938         if (PF(p_ni->port_guid == 0)) {
939                 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D17: "
940                         "Got Zero Port GUID! Found on the directed route:\n");
941                 osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_ERROR);
942                 goto Exit;
943         }
944
945         if (ib_smp_get_status(p_smp)) {
946                 OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
947                         "MAD status 0x%x received\n",
948                         cl_ntoh16(ib_smp_get_status(p_smp)));
949                 goto Exit;
950         }
951
952         /*
953            Determine if this node has already been discovered,
954            and process accordingly.
955            During processing of this node, hold the shared lock.
956          */
957
958         CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
959         p_node = osm_get_node_by_guid(sm->p_subn, p_ni->node_guid);
960
961         osm_dump_node_info_v2(sm->p_log, p_ni, FILE_ID, OSM_LOG_DEBUG);
962
963         if (!p_node)
964                 ni_rcv_process_new(sm, p_madw);
965         else
966                 ni_rcv_process_existing(sm, p_node, p_madw);
967
968         CL_PLOCK_RELEASE(sm->p_lock);
969
970 Exit:
971         OSM_LOG_EXIT(sm->p_log);
972 }