2 * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 #include <sys/errno.h>
38 #include <infiniband/opcode.h>
40 #include "cxgb4-abi.h"
/*
 * insert_recv_cqe - place one software-generated completion for @wq into
 * @cq's software queue.
 *
 * The visible lines zero a local `cqe`, build a header that carries status
 * T4_ERR_SWFLUSH, opcode FW_RI_SEND and the QP id read from wq->sq.qid,
 * stamp the entry with the CQ's current generation value (cq->gen), and
 * copy it into cq->sw_queue[cq->sw_pidx].
 *
 * NOTE(review): this extract is missing lines from the function (the
 * declaration of `cqe`, part of the header expression and whatever follows
 * the store into sw_queue) - confirm against the complete file.
 */
42 static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
46 PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
47 wq, cq, cq->sw_cidx, cq->sw_pidx);
48 memset(&cqe, 0, sizeof(cqe));
49 cqe.header = htobe32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
50 V_CQE_OPCODE(FW_RI_SEND) |
53 V_CQE_QPID(wq->sq.qid));
54 cqe.bits_type_ts = htobe64(V_CQE_GENBIT((u64)cq->gen));
/* Copy the finished entry into the slot indexed by cq->sw_pidx. */
55 cq->sw_queue[cq->sw_pidx] = cqe;
/*
 * c4iw_flush_rq - insert flush completions for @wq's receive queue into @cq
 * by calling insert_recv_cqe().
 *
 * @wq:    work queue whose rq.in_use count is consulted.
 * @cq:    completion queue handed to insert_recv_cqe().
 * @count: subtracted from wq->rq.in_use to form the local `in_use`; the
 *         debug message labels it the "skip count".
 *
 * NOTE(review): the loop around insert_recv_cqe() and the return statement
 * are not visible in this extract, so the number of entries inserted and
 * the returned value cannot be stated from here.  Also confirm what happens
 * when @count exceeds wq->rq.in_use (the local `in_use` is a signed int).
 */
59 int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
62 int in_use = wq->rq.in_use - count;
65 PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__,
66 wq, cq, wq->rq.in_use, count);
68 insert_recv_cqe(wq, cq);
/*
 * insert_sq_cqe - place one software-generated completion for the send
 * queue entry @swcqe into @cq's software queue.
 *
 * The visible lines zero a local `cqe`, build a header that carries status
 * T4_ERR_SWFLUSH, the opcode recorded in @swcqe and the QP id read from
 * wq->sq.qid, store swcqe->idx through CQE_WRID_SQ_IDX(), stamp the entry
 * with cq->gen, and copy it into cq->sw_queue[cq->sw_pidx].
 *
 * NOTE(review): the declaration of `cqe`, part of the header expression and
 * whatever follows the store into sw_queue are not visible in this
 * extract - confirm against the complete file.
 */
74 static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq,
75 struct t4_swsqe *swcqe)
79 PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
80 wq, cq, cq->sw_cidx, cq->sw_pidx);
81 memset(&cqe, 0, sizeof(cqe));
82 cqe.header = htobe32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
83 V_CQE_OPCODE(swcqe->opcode) |
86 V_CQE_QPID(wq->sq.qid));
/* Record which send queue slot this completion refers to. */
87 CQE_WRID_SQ_IDX(&cqe) = swcqe->idx;
88 cqe.bits_type_ts = htobe64(V_CQE_GENBIT((u64)cq->gen));
89 cq->sw_queue[cq->sw_pidx] = cqe;
/*
 * Forward declaration: c4iw_flush_sq() below calls advance_oldest_read(),
 * which is defined further down in this file.
 */
93 static void advance_oldest_read(struct t4_wq *wq);
/*
 * c4iw_flush_sq - walk the software send queue of @qhp from the flush index
 * to the producer index and insert a flush CQE for each entry into the
 * software queue of the QP's send CQ (via insert_sq_cqe()).
 *
 * The walk starts at wq->sq.flush_cidx, which is first seeded from
 * wq->sq.cidx when it still holds -1.  When the entry being visited is the
 * one recorded in wq->sq.oldest_read it is asserted to be a FW_RI_READ_REQ
 * and oldest_read is advanced.  After the walk, flush_cidx is moved forward
 * by `flushed` and wrapped at wq->sq.size.
 *
 * NOTE(review): the statements that increment `flushed`, mark the entry as
 * flushed, reset idx on wrap, and any locking are not visible in this
 * extract - confirm against the complete file.
 */
95 void c4iw_flush_sq(struct c4iw_qp *qhp)
97 unsigned short flushed = 0;
98 struct t4_wq *wq = &qhp->wq;
99 struct c4iw_cq *chp = to_c4iw_cq(qhp->ibv_qp.send_cq);
100 struct t4_cq *cq = &chp->cq;
102 struct t4_swsqe *swsqe;
/* flush_cidx == -1 is treated as "not set yet": seed it from cidx. */
104 if (wq->sq.flush_cidx == -1)
105 wq->sq.flush_cidx = wq->sq.cidx;
106 idx = wq->sq.flush_cidx;
107 BUG_ON(idx >= wq->sq.size);
108 while (idx != wq->sq.pidx) {
109 swsqe = &wq->sq.sw_sq[idx];
/* An entry must not be visited by this walk twice. */
110 BUG_ON(swsqe->flushed);
112 insert_sq_cqe(wq, cq, swsqe);
113 if (wq->sq.oldest_read == swsqe) {
114 BUG_ON(swsqe->opcode != FW_RI_READ_REQ);
115 advance_oldest_read(wq);
118 if (++idx == wq->sq.size)
/* Advance the saved flush position, wrapping at the queue size. */
121 wq->sq.flush_cidx += flushed;
122 if (wq->sq.flush_cidx >= wq->sq.size)
123 wq->sq.flush_cidx -= wq->sq.size;
/*
 * flush_completed_wrs - scan @wq's software send queue from flush_cidx
 * toward pidx and move saved completions into @cq's software queue.
 *
 * For each entry visited:
 *  - an entry that is not signaled is stepped over;
 *  - a signaled entry whose `complete` flag is set has its saved CQE
 *    (swsqe->cqe) tagged with V_CQE_SWCQE(1) and copied into
 *    cq->sw_queue[cq->sw_pidx], after which wq->sq.flush_cidx is set to
 *    the next index.
 *
 * As in c4iw_flush_sq(), flush_cidx is seeded from wq->sq.cidx when it
 * still holds -1.
 *
 * NOTE(review): the branch taken for a signaled entry that is not yet
 * complete, the index reset on wrap, and the update of the software CQ
 * producer index are not visible in this extract.
 */
126 static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq)
128 struct t4_swsqe *swsqe;
131 if (wq->sq.flush_cidx == -1)
132 wq->sq.flush_cidx = wq->sq.cidx;
133 cidx = wq->sq.flush_cidx;
134 BUG_ON(cidx >= wq->sq.size);
136 while (cidx != wq->sq.pidx) {
137 swsqe = &wq->sq.sw_sq[cidx];
138 if (!swsqe->signaled) {
139 if (++cidx == wq->sq.size)
141 } else if (swsqe->complete) {
143 BUG_ON(swsqe->flushed);
146 * Insert this completed cqe into the swcq.
148 PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n",
149 __func__, cidx, cq->sw_pidx);
151 swsqe->cqe.header |= htobe32(V_CQE_SWCQE(1));
152 cq->sw_queue[cq->sw_pidx] = swsqe->cqe;
155 if (++cidx == wq->sq.size)
157 wq->sq.flush_cidx = cidx;
/*
 * create_read_req_cqe - fill @read_cqe with a read-request completion
 * derived from the read-response CQE @hw_cqe and from the send queue entry
 * recorded in wq->sq.oldest_read.
 *
 * @wq:       work queue; wq->sq.oldest_read supplies the SQ index and the
 *            read length (read_len is passed through be32toh()).
 * @hw_cqe:   source CQE; its QP id, SW_CQE flag and bits_type_ts are
 *            carried over.
 * @read_cqe: destination, written in place; its opcode is set to
 *            FW_RI_READ_REQ.
 *
 * wq->sq.oldest_read is dereferenced without a check in the visible lines,
 * so the caller presumably guarantees it is non-NULL - TODO confirm.
 *
 * NOTE(review): part of the header expression is not visible in this
 * extract.
 */
163 static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe,
164 struct t4_cqe *read_cqe)
166 read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx;
167 read_cqe->len = be32toh(wq->sq.oldest_read->read_len);
168 read_cqe->header = htobe32(V_CQE_QPID(CQE_QPID(hw_cqe)) |
169 V_CQE_SWCQE(SW_CQE(hw_cqe)) |
170 V_CQE_OPCODE(FW_RI_READ_REQ) |
172 read_cqe->bits_type_ts = hw_cqe->bits_type_ts;
/*
 * advance_oldest_read - move wq->sq.oldest_read past its current entry.
 *
 * Starting at the slot after the current oldest_read, the loop walks the
 * software send queue up to wq->sq.pidx, pointing oldest_read at each slot
 * in turn and testing whether its opcode is FW_RI_READ_REQ.  If the loop
 * runs to completion, oldest_read is set to NULL.
 *
 * NOTE(review): the statement executed when a FW_RI_READ_REQ is found
 * (presumably an early return that leaves oldest_read on that slot) and the
 * wrap-around assignments to rptr are not visible in this extract.
 */
175 static void advance_oldest_read(struct t4_wq *wq)
/* Index of the slot immediately after the current oldest read. */
178 u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1;
180 if (rptr == wq->sq.size)
182 while (rptr != wq->sq.pidx) {
183 wq->sq.oldest_read = &wq->sq.sw_sq[rptr];
185 if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ)
187 if (++rptr == wq->sq.size)
190 wq->sq.oldest_read = NULL;
194 * Move all CQEs from the HWCQ into the SWCQ.
195 * Deal with out-of-order and/or completions that complete
196 * prior unsignalled WRs.
/*
 * c4iw_flush_hw_cq - drain the hardware CQ of @chp, moving the CQEs that
 * matter into the software CQ.
 *
 * Visible flow, per CQE returned by t4_next_hw_cqe():
 *  - the owning QP is looked up from the CQE's QP id with get_qhp();
 *  - FW_RI_TERMINATE CQEs get special handling (action not visible);
 *  - FW_RI_READ_RESP CQEs: an egress error (CQE_TYPE == 1) is logged with
 *    syslog(); CQE_WRID_STAG == 1 is tested (the in-file comment says
 *    peer2peer RTR reads are dropped); a response for an unsignaled read
 *    only advances oldest_read; otherwise a read-request CQE is built in
 *    the local `read_cqe` and oldest_read is advanced;
 *  - SQ-type CQEs are saved into the matching sw_sq slot and
 *    flush_completed_wrs() is called;
 *  - the remaining path tags the slot at sw_queue[sw_pidx] as a software
 *    CQE and calls t4_swcq_produce();
 *  - t4_hwcq_consume() is called and the next hardware CQE is fetched.
 *
 * NOTE(review): the loop construct, the `goto`/`continue` statements that
 * implement the "drop" cases, the assignments that redirect hw_cqe to
 * read_cqe and copy it into the software queue, and the declarations of
 * `ret` and `qhp` are not visible in this extract.
 */
198 void c4iw_flush_hw_cq(struct c4iw_cq *chp)
200 struct t4_cqe *hw_cqe, *swcqe, read_cqe;
202 struct t4_swsqe *swsqe;
205 PDBG("%s cqid 0x%x\n", __func__, chp->cq.cqid);
206 ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
209 * This logic is similar to poll_cq(), but not quite the same
210 * unfortunately. Need to move pertinent HW CQEs to the SW CQ but
211 * also do any translation magic that poll_cq() normally does.
214 qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe));
217 * drop CQEs with no associated QP
222 if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE)
225 if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) {
228 * If we have reached here because of async
229 * event or other error, and have egress error
232 if (CQE_TYPE(hw_cqe) == 1) {
233 syslog(LOG_CRIT, "%s: got egress error in \
234 read-response, dropping!\n", __func__);
239 * drop peer2peer RTR reads.
241 if (CQE_WRID_STAG(hw_cqe) == 1)
245 * Eat completions for unsignaled read WRs.
/*
 * NOTE(review): oldest_read is dereferenced here with no NULL check among
 * the visible lines, although advance_oldest_read() can leave it NULL.
 * Confirm a READ_RESP cannot arrive when no read is outstanding.
 */
247 if (!qhp->wq.sq.oldest_read->signaled) {
248 advance_oldest_read(&qhp->wq);
253 * Don't write to the HWCQ, create a new read req CQE
254 * in local memory and move it into the swcq.
256 create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe);
258 advance_oldest_read(&qhp->wq);
261 /* if its a SQ completion, then do the magic to move all the
262 * unsignaled and now in-order completions into the swcq.
264 if (SQ_TYPE(hw_cqe)) {
265 int idx = CQE_WRID_SQ_IDX(hw_cqe);
267 BUG_ON(idx >= qhp->wq.sq.size);
268 swsqe = &qhp->wq.sq.sw_sq[idx];
269 swsqe->cqe = *hw_cqe;
271 flush_completed_wrs(&qhp->wq, &chp->cq);
273 swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx];
275 swcqe->header |= htobe32(V_CQE_SWCQE(1));
276 t4_swcq_produce(&chp->cq);
279 t4_hwcq_consume(&chp->cq);
280 ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
/*
 * cqe_completes_wr - classify @cqe with respect to @wq.
 *
 * The visible lines test four cases in order:
 *   1. opcode is FW_RI_TERMINATE;
 *   2. opcode is FW_RI_RDMA_WRITE on an RQ-type CQE;
 *   3. opcode is FW_RI_READ_RESP on an SQ-type CQE;
 *   4. a send opcode on an RQ-type CQE while t4_rq_empty(@wq) is true.
 *
 * NOTE(review): none of the return statements are visible in this extract.
 * Going by the function name, each of these cases presumably yields "does
 * not complete a WR" (0) with a non-zero default - confirm against the
 * complete file.
 */
284 static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
286 if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
289 if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
292 if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
295 if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
/*
 * c4iw_count_rcqes - scan @cq's software queue up to cq->sw_pidx looking
 * for receive completions that belong to @wq.
 *
 * An entry matches when it is RQ-type, its opcode is not FW_RI_READ_RESP,
 * its QP id equals wq->sq.qid, and cqe_completes_wr() accepts it.  The
 * final value of *@count is printed in the debug message.
 *
 * NOTE(review): the initialisation of `ptr` and *@count, the statement
 * executed for a matching entry (presumably an increment of *@count) and
 * the index reset on wrap are not visible in this extract.
 */
300 void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
307 BUG_ON(ptr >= cq->size);
308 while (ptr != cq->sw_pidx) {
309 cqe = &cq->sw_queue[ptr];
310 if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
311 (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq))
313 if (++ptr == cq->size)
316 PDBG("%s cq %p count %d\n", __func__, cq, *count);
/*
 * dump_cqe - log a CQE to syslog at LOG_NOTICE as four 64-bit hex words.
 *
 * Each of p[0]..p[3] is converted with be64toh() before printing.
 *
 * NOTE(review): the declaration of `p` is not visible in this extract; it
 * is presumably @arg viewed as an array of 64-bit big-endian words, which
 * requires @arg to reference at least 32 readable bytes - confirm.
 */
319 static void dump_cqe(void *arg)
322 syslog(LOG_NOTICE, "cxgb4 err cqe %016llx %016llx %016llx %016llx\n",
323 (long long)be64toh(p[0]),
324 (long long)be64toh(p[1]),
325 (long long)be64toh(p[2]),
326 (long long)be64toh(p[3]));
333 * check the validity of the first CQE,
334 * supply the wq associated with the qpid.
336 * credit: cq credit to return to sge.
337 * cqe_flushed: 1 iff the CQE is flushed.
338 * cqe: copy of the polled CQE.
342 * -EAGAIN CQE skipped, try again.
343 * -EOVERFLOW CQ overflow detected.
/*
 * poll_cq - examine the next CQE of @cq on behalf of @wq and update the
 * work queue state accordingly.
 *
 * @wq:          work queue the CQE belongs to.
 * @cq:          completion queue being polled (via t4_next_cqe()).
 * @cqe:         output CQE; the store into it is not among the visible
 *               lines.
 * @cqe_flushed: set to whether the CQE status equals T4_ERR_SWFLUSH when
 *               the CQE carries a status or the wq is already in error.
 * @cookie:      set to the wr_id of the completed sw_sq / sw_rq entry.
 * @credit:      not written by any visible line.
 *
 * Visible flow:
 *  - FW_RI_READ_RESP CQEs are screened: egress errors are logged, CQEs
 *    with CQE_WRID_STAG == 1 are skipped, and in both cases a non-zero
 *    status puts the wq in error; responses to unsignaled reads only
 *    advance oldest_read; otherwise a read-request CQE is built in
 *    `read_cqe`.
 *  - RQ-type CQEs put the wq in error if the RQ is empty or the MSN does
 *    not match wq->rq.msn (the latter also ORs T4_ERR_MSN into the CQE
 *    status).
 *  - A hardware SQ CQE whose index differs from wq->sq.cidx is an
 *    out-of-order completion and is parked in sw_sq[idx].
 *  - For in-order SQ completions, sq.in_use is reduced by the distance
 *    from cidx to idx (with wrap), cidx is moved to idx and *@cookie is
 *    taken from that slot; for RQ completions *@cookie is taken from
 *    sw_rq[rq.cidx].
 *  - flush_completed_wrs() is then called to move now-in-order saved
 *    completions into the software CQ.
 *
 * NOTE(review): the `goto`/`return` statements, label names, the consume
 * calls on the SQ/RQ/CQ and the returned value are not visible in this
 * extract.
 */
345 static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
346 u8 *cqe_flushed, u64 *cookie, u32 *credit)
349 struct t4_cqe *hw_cqe, read_cqe;
354 ret = t4_next_cqe(cq, &hw_cqe);
358 PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x"
359 " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
360 __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
361 CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe),
362 CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
363 CQE_WRID_LOW(hw_cqe));
366 * skip cqe's not affiliated with a QP.
374 * Gotta tweak READ completions:
375 * 1) the cqe doesn't contain the sq_wptr from the wr.
376 * 2) opcode not reflected from the wr.
377 * 3) read_len not reflected from the wr.
378 * 4) T4 HW (for now) inserts target read response failures which
379 * need to be skipped.
381 if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) {
384 * If we have reached here because of async
385 * event or other error, and have egress error
388 if (CQE_TYPE(hw_cqe) == 1) {
389 syslog(LOG_CRIT, "%s: got egress error in \
390 read-response, dropping!\n", __func__);
391 if (CQE_STATUS(hw_cqe))
392 t4_set_wq_in_error(wq);
398 * If this is an unsolicited read response, then the read
399 * was generated by the kernel driver as part of peer-2-peer
400 * connection setup, or a target read response failure.
401 * So skip the completion.
403 if (CQE_WRID_STAG(hw_cqe) == 1) {
404 if (CQE_STATUS(hw_cqe))
405 t4_set_wq_in_error(wq);
411 * Eat completions for unsignaled read WRs.
/*
 * NOTE(review): oldest_read is dereferenced here with no NULL check among
 * the visible lines, although advance_oldest_read() can leave it NULL.
 */
413 if (!wq->sq.oldest_read->signaled) {
414 advance_oldest_read(wq);
420 * Don't write to the HWCQ, so create a new read req CQE
423 create_read_req_cqe(wq, hw_cqe, &read_cqe);
425 advance_oldest_read(wq);
428 if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) {
433 if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) {
434 *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH);
437 if (!*cqe_flushed && CQE_STATUS(hw_cqe))
/*
 * NOTE(review): the next assertion compares the pointer `cqe_flushed`
 * with 0, not the value it points to (`*cqe_flushed`, which was assigned
 * a few lines above).  With a valid pointer the first operand is always
 * false, so the assertion can never fire - confirm whether
 * `*cqe_flushed == 0` was intended.
 */
440 BUG_ON((cqe_flushed == 0) && !SW_CQE(hw_cqe));
447 if (RQ_TYPE(hw_cqe)) {
450 * HW only validates 4 bits of MSN. So we must validate that
451 * the MSN in the SEND is the next expected MSN. If its not,
452 * then we complete this with T4_ERR_MSN and mark the wq in
456 if (t4_rq_empty(wq)) {
457 t4_set_wq_in_error(wq);
461 if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
462 t4_set_wq_in_error(wq);
463 hw_cqe->header |= htobe32(V_CQE_STATUS(T4_ERR_MSN));
470 * If we get here its a send completion.
472 * Handle out of order completion. These get stuffed
473 * in the SW SQ. Then the SW SQ is walked to move any
474 * now in-order completions into the SW CQ. This handles
476 * 1) reaping unsignaled WRs when the first subsequent
477 * signaled WR is completed.
478 * 2) out of order read completions.
480 if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) {
481 struct t4_swsqe *swsqe;
482 int idx = CQE_WRID_SQ_IDX(hw_cqe);
484 PDBG("%s out of order completion going in sw_sq at idx %u\n",
486 BUG_ON(idx >= wq->sq.size);
487 swsqe = &wq->sq.sw_sq[idx];
488 swsqe->cqe = *hw_cqe;
498 * Reap the associated WR(s) that are freed up with this
501 if (SQ_TYPE(hw_cqe)) {
502 int idx = CQE_WRID_SQ_IDX(hw_cqe);
503 BUG_ON(idx >= wq->sq.size);
506 * Account for any unsignaled completions completed by
507 * this signaled completion. In this case, cidx points
508 * to the first unsignaled one, and idx points to the
509 * signaled one. So adjust in_use based on this delta.
510 * if this is not completing any unsigned wrs, then the
511 * delta will be 0. Handle wrapping also!
513 if (idx < wq->sq.cidx)
514 wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx;
516 wq->sq.in_use -= idx - wq->sq.cidx;
517 BUG_ON(wq->sq.in_use <= 0 || wq->sq.in_use >= wq->sq.size);
519 wq->sq.cidx = (u16)idx;
520 PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx);
521 *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
524 PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
525 BUG_ON(wq->rq.cidx >= wq->rq.size);
526 *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
527 BUG_ON(t4_rq_empty(wq));
534 * Flush any completed cqes that are now in-order.
536 flush_completed_wrs(wq, cq);
539 if (SW_CQE(hw_cqe)) {
540 PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n",
541 __func__, cq, cq->cqid, cq->sw_cidx);
544 PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n",
545 __func__, cq, cq->cqid, cq->cidx);
552 * Get one cq entry from c4iw and map it to openib.
557 * -EAGAIN caller must try again
558 * any other -errno fatal error
/*
 * c4iw_poll_cq_one - poll a single completion from @chp and translate it
 * into the verbs work completion @wc.
 *
 * Visible flow:
 *  - t4_next_cqe() peeks at the next CQE; when STALL_DETECTION is defined,
 *    an -ENODATA result is checked against the stall timeout (stall_to)
 *    using chp->time, and chp->time is refreshed with gettimeofday();
 *  - the owning QP is looked up from the CQE's QP id and qhp->lock is
 *    taken around the poll_cq() call and the translation;
 *  - wc->qp_num and wc->vendor_err are filled from the QP id and the CQE
 *    status;
 *  - CQEs with CQE_TYPE == 0 are reported as IBV_WC_RECV, with byte_len
 *    taken from the CQE when its status is zero; other CQEs have their
 *    opcode mapped to IBV_WC_RDMA_WRITE, IBV_WC_RDMA_READ, IBV_WC_SEND or
 *    IBV_WC_BIND_MW;
 *  - wc->status is either IBV_WC_WR_FLUSH_ERR or a mapping of the T4
 *    status code onto an IBV_WC_* status;
 *  - any status other than success or flush is reported through syslog().
 *
 * c4iw_poll_cq() calls this function again for as long as it returns
 * -EAGAIN.
 *
 * NOTE(review): most `case` labels, `break`s, the conditions selecting the
 * flush-error path, the wr_id assignment, the error exits and the return
 * statements are not visible in this extract.
 */
560 static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
562 struct c4iw_qp *qhp = NULL;
563 struct t4_cqe cqe, *rd_cqe;
570 ret = t4_next_cqe(&chp->cq, &rd_cqe);
573 #ifdef STALL_DETECTION
574 if (ret == -ENODATA && stall_to && !chp->dumped) {
577 gettimeofday(&t, NULL);
578 if ((t.tv_sec - chp->time.tv_sec) > stall_to) {
587 #ifdef STALL_DETECTION
588 gettimeofday(&chp->time, NULL);
591 qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe));
595 pthread_spin_lock(&qhp->lock);
598 ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
604 wc->qp_num = qhp->wq.sq.qid;
605 wc->vendor_err = CQE_STATUS(&cqe);
608 PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
609 "lo 0x%x cookie 0x%llx\n", __func__,
610 CQE_QPID(&cqe), CQE_TYPE(&cqe),
611 CQE_OPCODE(&cqe), CQE_STATUS(&cqe), CQE_WRID_HI(&cqe),
612 CQE_WRID_LOW(&cqe), (unsigned long long)cookie);
/* CQE_TYPE == 0 is handled as a receive completion. */
614 if (CQE_TYPE(&cqe) == 0) {
615 if (!CQE_STATUS(&cqe))
616 wc->byte_len = CQE_LEN(&cqe);
619 wc->opcode = IBV_WC_RECV;
621 switch (CQE_OPCODE(&cqe)) {
622 case FW_RI_RDMA_WRITE:
623 wc->opcode = IBV_WC_RDMA_WRITE;
626 wc->opcode = IBV_WC_RDMA_READ;
627 wc->byte_len = CQE_LEN(&cqe);
630 case FW_RI_SEND_WITH_SE:
631 case FW_RI_SEND_WITH_INV:
632 case FW_RI_SEND_WITH_SE_INV:
633 wc->opcode = IBV_WC_SEND;
636 wc->opcode = IBV_WC_BIND_MW;
639 PDBG("Unexpected opcode %d "
640 "in the CQE received for QPID=0x%0x\n",
641 CQE_OPCODE(&cqe), CQE_QPID(&cqe));
648 wc->status = IBV_WC_WR_FLUSH_ERR;
/* Translate the T4 status code into a verbs completion status. */
651 switch (CQE_STATUS(&cqe)) {
653 wc->status = IBV_WC_SUCCESS;
656 wc->status = IBV_WC_LOC_ACCESS_ERR;
659 wc->status = IBV_WC_LOC_PROT_ERR;
663 wc->status = IBV_WC_LOC_ACCESS_ERR;
666 wc->status = IBV_WC_GENERAL_ERR;
669 wc->status = IBV_WC_LOC_LEN_ERR;
671 case T4_ERR_INVALIDATE_SHARED_MR:
672 case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
673 wc->status = IBV_WC_MW_BIND_ERR;
677 case T4_ERR_PDU_LEN_ERR:
678 case T4_ERR_OUT_OF_RQE:
679 case T4_ERR_DDP_VERSION:
680 case T4_ERR_RDMA_VERSION:
681 case T4_ERR_DDP_QUEUE_NUM:
685 case T4_ERR_MSN_RANGE:
686 case T4_ERR_IRD_OVERFLOW:
688 case T4_ERR_INTERNAL_ERR:
689 wc->status = IBV_WC_FATAL_ERR;
692 wc->status = IBV_WC_WR_FLUSH_ERR;
695 PDBG("Unexpected cqe_status 0x%x for QPID=0x%0x\n",
696 CQE_STATUS(&cqe), CQE_QPID(&cqe));
697 wc->status = IBV_WC_FATAL_ERR;
/* Log every failure except flush errors. */
700 if (wc->status && wc->status != IBV_WC_WR_FLUSH_ERR)
701 syslog(LOG_NOTICE, "cxgb4 app err cqid %u qpid %u "
702 "type %u opcode %u status 0x%x\n",
703 chp->cq.cqid, CQE_QPID(&cqe), CQE_TYPE(&cqe),
704 CQE_OPCODE(&cqe), CQE_STATUS(&cqe));
707 pthread_spin_unlock(&qhp->lock);
/*
 * c4iw_poll_cq - poll up to @num_entries completions from @ibcq into the
 * array @wc.
 *
 * If the CQ is flagged as in error, the flag is reset and
 * c4iw_flush_qps() is run on the owning device before polling.  Polling
 * happens under chp->lock: each slot wc[npolled] is filled by
 * c4iw_poll_cq_one(), which is retried for as long as it returns -EAGAIN.
 *
 * Returns the number of entries polled when the last result was 0 or
 * -ENODATA, otherwise that negative error code.  One visible path instead
 * returns t4_cq_notempty() on the CQ.
 *
 * NOTE(review): the condition guarding the t4_cq_notempty() return
 * (presumably num_entries == 0), the `do` keyword and the loop exit on
 * error are not visible in this extract.
 */
711 int c4iw_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc)
717 chp = to_c4iw_cq(ibcq);
719 if (t4_cq_in_error(&chp->cq)) {
720 t4_reset_cq_in_error(&chp->cq);
721 c4iw_flush_qps(chp->rhp);
725 return t4_cq_notempty(&chp->cq);
727 pthread_spin_lock(&chp->lock);
728 for (npolled = 0; npolled < num_entries; ++npolled) {
730 err = c4iw_poll_cq_one(chp, wc + npolled);
731 } while (err == -EAGAIN);
735 pthread_spin_unlock(&chp->lock);
/* 0 and -ENODATA both mean "no error": report how many were polled. */
736 return !err || err == -ENODATA ? npolled : err;
/*
 * c4iw_arm_cq - arm @ibcq by calling t4_arm_cq() while holding chp->lock.
 *
 * @ibcq:      verbs CQ, converted to the driver CQ with to_c4iw_cq().
 * @solicited: passed through unchanged to t4_arm_cq().
 *
 * NOTE(review): the return statement is not visible in this extract; the
 * function presumably returns the value stored in `ret` - confirm.
 */
739 int c4iw_arm_cq(struct ibv_cq *ibcq, int solicited)
745 chp = to_c4iw_cq(ibcq);
746 pthread_spin_lock(&chp->lock);
747 ret = t4_arm_cq(&chp->cq, solicited);
748 pthread_spin_unlock(&chp->lock);