/*
 * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/tcp.h>
#include <asm/ioctls.h>
#include <linux/workqueue.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_fmr_pool.h>
#include <rdma/ib_umem.h>
#include <net/tcp.h> /* for memcpy_toiovec */
#include <asm/uaccess.h>
#include <linux/delay.h>

#include "sdp.h"	/* local SDP definitions: sdp_sk(), mbuf helpers, SrcAvail state */

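/*
 * Build and queue a SrcAvail message advertising the user buffer described by
 * tx_sa (already pinned and FMR-mapped), inlining the first chunk of payload
 * as required in combined mode.
 */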
static int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct mbuf *mb;
	struct page *payload_pg;
	struct ib_umem_chunk *chunk;
	int payload_len;
	int off, len;

	BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey);
	BUG_ON(!tx_sa->umem->chunk_list.next);

	chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list);

	off = tx_sa->umem->offset;
	len = tx_sa->umem->length;

	tx_sa->bytes_sent = tx_sa->bytes_acked = 0;

	mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0);
	if (!mb)
		return -ENOMEM;

	sdp_dbg_data(sk, "sending SrcAvail\n");

	TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa is hung on the mb but
					* continues to live after the mb is freed */

	/* must have payload inlined in SrcAvail packet in combined mode */
	payload_len = MIN(tx_sa->umem->page_size - off, len);
	payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah));
	payload_pg = sg_page(&chunk->page_list[0]);

	sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n",
		     off, payload_pg, payload_len);

	mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags,
			  payload_pg, off, payload_len);

	mb->len      += payload_len;
	mb->data_len  = payload_len;
	mb->truesize += payload_len;
//	sk->sk_wmem_queued += payload_len;
//	sk->sk_forward_alloc -= payload_len;

	mb_entail(sk, ssk, mb);

	ssk->write_seq += payload_len;
	SDP_SKB_CB(mb)->end_seq += payload_len;

	tx_sa->bytes_sent = tx_sa->umem->length;
	tx_sa->bytes_acked = payload_len;

	/* TODO: pushing the mb into the tx_queue should be enough */

	return 0;
}

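/* Queue a SrcAvailCancel and arm a timeout in case the peer never answers. */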
static int sdp_post_srcavail_cancel(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct mbuf *mb;

	sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n");

	mb = sdp_alloc_mb_srcavail_cancel(sk, 0);
	mb_entail(sk, ssk, mb);
	sdp_post_sends(ssk, 0);

	schedule_delayed_work(&ssk->srcavail_cancel_work,
			      SDP_SRCAVAIL_CANCEL_TIMEOUT);
	return 0;
}

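/* Delayed-work handler: the peer answered neither SrcAvail nor SrcAvailCancel. */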
void srcavail_cancel_timeout(struct work_struct *work)
{
	struct sdp_sock *ssk =
		container_of(work, struct sdp_sock, srcavail_cancel_work.work);
	struct socket *sk = ssk->socket;

	sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timed out."
		     " closing connection\n");
	sdp_set_error(sk, -ECONNRESET);
}

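/*
 * Sleep until the peer acknowledges the whole SrcAvail with RdmaRdCompl (or
 * aborts it with SendSM), or until timeout/signal/abort. Returns 0 when all
 * advertised bytes have been acknowledged.
 */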
static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p,
		int ignore_signals)
{
	struct socket *sk = ssk->socket;
	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
	long current_timeo = *timeo_p;
	long vm_wait = 0;
	int err = 0;
	DEFINE_WAIT(wait);

	sdp_dbg_data(sk, "sleep till RdmaRdCompl. timeo = %ld.\n", *timeo_p);
	sdp_prf1(sk, NULL, "Going to sleep");
	while (ssk->qp_active) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

		if (unlikely(!*timeo_p)) {
			err = -ETIME;
			tx_sa->abort_flags |= TX_SA_TIMEDOUT;
			sdp_prf1(sk, NULL, "timeout");
			SDPSTATS_COUNTER_INC(zcopy_tx_timeout);
			break;
		} else if (tx_sa->bytes_acked > tx_sa->bytes_sent) {
			err = -EINVAL;
			sdp_dbg_data(sk, "acked bytes > sent bytes\n");
			tx_sa->abort_flags |= TX_SA_ERROR;
			break;
		}

		if (tx_sa->abort_flags & TX_SA_SENDSM) {
			sdp_prf1(sk, NULL, "Aborting SrcAvail sending");
			SDPSTATS_COUNTER_INC(zcopy_tx_aborted);
			err = -EAGAIN;
			break;
		}

		if (!ignore_signals) {
			if (signal_pending(current)) {
				err = -EINTR;
				sdp_prf1(sk, NULL, "signalled");
				tx_sa->abort_flags |= TX_SA_INTRRUPTED;
				break;
			}

			if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) {
				sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n");
				tx_sa->abort_flags |= TX_SA_CROSS_SEND;
				SDPSTATS_COUNTER_INC(zcopy_cross_send);
				err = -ETIME;
				break;
			}
		}

		posts_handler_put(ssk);

		sk_wait_event(sk, &current_timeo,
			      tx_sa->abort_flags &&
			      (tx_sa->bytes_acked < tx_sa->bytes_sent) &&
			      vm_wait);
		sdp_dbg_data(ssk->socket, "woke up sleepers\n");

		posts_handler_get(ssk);

		if (tx_sa->bytes_acked == tx_sa->bytes_sent)
			break;

		if (vm_wait) {
			vm_wait -= current_timeo;
			current_timeo = *timeo_p;
			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
			    (current_timeo -= vm_wait) < 0)
				current_timeo = 0;
			vm_wait = 0;
		}
		*timeo_p = current_timeo;
	}

	finish_wait(sk->sk_sleep, &wait);

	sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n",
		     tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags);

	if (!ssk->qp_active) {
		sdp_dbg(sk, "QP destroyed while waiting\n");
		return -EINVAL;
	}
	return err;
}

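/* Block (uninterruptibly) until the locally posted RDMA read work request completes. */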
static void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk)
{
	struct socket *sk = ssk->socket;
	long timeo = HZ * 5; /* Timeout for RDMA read */
	DEFINE_WAIT(wait);

	sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n");
	while (1) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);

		if (!ssk->tx_ring.rdma_inflight->busy) {
			sdp_dbg_data(sk, "got rdma cqe\n");
			break;
		}
		if (!ssk->qp_active) {
			sdp_dbg_data(sk, "QP destroyed\n");
			break;
		}
		if (!timeo) {
			sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n");
			break;
		}

		posts_handler_put(ssk);

		sdp_prf1(sk, NULL, "Going to sleep");
		sk_wait_event(sk, &timeo,
			      !ssk->tx_ring.rdma_inflight->busy);
		sdp_prf1(sk, NULL, "Woke up");
		sdp_dbg_data(ssk->socket, "woke up sleepers\n");

		posts_handler_get(ssk);
	}

	finish_wait(sk->sk_sleep, &wait);
	sdp_dbg_data(sk, "Finished waiting\n");
}

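/* Report to the peer how many bytes of its SrcAvail we have consumed so far. */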
int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
		struct rx_srcavail_state *rx_sa)
{
	struct mbuf *mb;
	int copied = rx_sa->used - rx_sa->reported;

	if (rx_sa->used <= rx_sa->reported)
		return 0;

	mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0);
	rx_sa->reported += copied;

	/* TODO: What if no tx_credits available? */
	sdp_post_send(ssk, mb);
	return 0;
}

int sdp_post_sendsm(struct socket *sk)
{
	struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0);

	sdp_post_send(sdp_sk(sk), mb);
	return 0;
}

static int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len)
{
	sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len);
	while (len > 0) {
		int copy = min_t(unsigned int, iov->iov_len, len);

		len -= copy;
		iov->iov_len -= copy;
		iov->iov_base += copy;
		if (!iov->iov_len)
			iov++;
	}
	return 0;
}

static inline int sge_bytes(struct ib_sge *sge, int sge_cnt)
{
	int bytes = 0;

	while (sge_cnt > 0) {
		bytes += sge->length;
		sge++;
		sge_cnt--;
	}
	return bytes;
}

void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack)
{
	struct socket *sk = ssk->socket;
	unsigned long flags;

	spin_lock_irqsave(&ssk->tx_sa_lock, flags);

	if (!ssk->tx_sa) {
		sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail");
		goto out;
	}

	if (ssk->tx_sa->mseq > mseq_ack) {
		sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. "
			     "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
			     mseq_ack, ssk->tx_sa->mseq);
		goto out;
	}

	sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n");

	ssk->tx_sa->abort_flags |= TX_SA_SENDSM;
	cancel_delayed_work(&ssk->srcavail_cancel_work);

	wake_up(sk->sk_sleep);
	sdp_dbg_data(sk, "woke up sleepers\n");

out:
	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
}

void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
		u32 bytes_completed)
{
	struct socket *sk = ssk->socket;
	unsigned long flags;

	sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa);
	sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa);

	spin_lock_irqsave(&ssk->tx_sa_lock, flags);

	if (!ssk->tx_sa) {
		sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n");
		goto out;
	}

	if (ssk->tx_sa->mseq > mseq_ack) {
		sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. "
			     "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
			     mseq_ack, ssk->tx_sa->mseq);
		goto out;
	}

	ssk->tx_sa->bytes_acked += bytes_completed;

	wake_up(sk->sk_sleep);
	sdp_dbg_data(sk, "woke up sleepers\n");

out:
	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
}

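/* How many more bytes may be pinned under RLIMIT_MEMLOCK, starting at 'offset'. */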
static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset)
{
	unsigned long avail;
	unsigned long lock_limit;

	if (capable(CAP_IPC_LOCK))
		return ULONG_MAX;

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT);

	return avail - offset;
}

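/*
 * Pin the user buffer with ib_umem_get(), collect its DMA page addresses and
 * map them through the device FMR pool so it can serve as an RDMA target.
 * len is clamped to SDP_MAX_RDMA_READ_LEN and to the RLIMIT_MEMLOCK budget.
 */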
static int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len,
		struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
{
	struct ib_pool_fmr *fmr;
	struct ib_umem *umem;
	struct ib_device *dev;
	struct ib_umem_chunk *chunk;
	u64 *pages;
	int n = 0, j, k;
	int rc = 0;
	unsigned long max_lockable_bytes;

	if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) {
		sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n",
			     len, SDP_MAX_RDMA_READ_LEN);
		len = SDP_MAX_RDMA_READ_LEN;
	}

	max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK);
	if (unlikely(len > max_lockable_bytes)) {
		sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n",
			     len, max_lockable_bytes);
		len = max_lockable_bytes;
	}

	sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n",
		     uaddr, len, max_lockable_bytes);

	umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len,
			   IB_ACCESS_REMOTE_WRITE, 0);
	if (IS_ERR(umem)) {
		rc = PTR_ERR(umem);
		sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc);
		sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n",
			 current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur,
			 current->signal->rlim[RLIMIT_MEMLOCK].rlim_max,
			 capable(CAP_IPC_LOCK));
		goto err_umem_get;
	}

	sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n",
		     umem->offset, umem->length);

	pages = (u64 *) __get_free_page(GFP_KERNEL);
	if (!pages)
		goto err_pages_alloc;

	dev = sdp_sk(sk)->ib_device;
	list_for_each_entry(chunk, &umem->chunk_list, list) {
		for (j = 0; j < chunk->nmap; ++j) {
			len = ib_sg_dma_len(dev,
					&chunk->page_list[j]) >> PAGE_SHIFT;

			for (k = 0; k < len; ++k) {
				pages[n++] = ib_sg_dma_address(dev,
						&chunk->page_list[j]) +
					umem->page_size * k;
			}
		}
	}

	fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0);
	if (IS_ERR(fmr)) {
		sdp_warn(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr));
		goto err_fmr_alloc;
	}

	free_page((unsigned long) pages);

	*_umem = umem;
	*_fmr = fmr;

	return 0;

err_fmr_alloc:
	free_page((unsigned long) pages);

err_pages_alloc:
	ib_umem_release(umem);

err_umem_get:
	return rc;
}

void sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
{
	if (!sdp_sk(sk)->qp_active)
		return;

	ib_fmr_pool_unmap(*_fmr);
	*_fmr = NULL;

	ib_umem_release(*_umem);
	*_umem = NULL;
}

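/* Post a single-SGE RDMA read pulling the peer's advertised buffer into rx_sa->umem. */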
static int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct ib_send_wr *bad_wr;
	struct ib_send_wr wr = { NULL };
	struct ib_sge sge;

	wr.opcode = IB_WR_RDMA_READ;
	wr.next = NULL;
	wr.wr_id = SDP_OP_RDMA;
	wr.wr.rdma.rkey = rx_sa->rkey;

	ssk->tx_ring.rdma_inflight = rx_sa;

	sge.addr = rx_sa->umem->offset;
	sge.length = rx_sa->umem->length;
	sge.lkey = rx_sa->fmr->fmr->lkey;

	wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used;
	wr.num_sge = 1;
	wr.sg_list = &sge;

	wr.send_flags = IB_SEND_SIGNALED;

	return ib_post_send(ssk->qp, &wr, &bad_wr);
}

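/*
 * Receive path for a SrcAvail: map the user iovec, RDMA-read the remote
 * buffer into it, wait for completion and account for the copied bytes.
 */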
int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
		int len)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb);
	int got_srcavail_cancel;
	int rc = 0;
	int copied = 0;

	sdp_dbg_data(ssk->socket, "preparing RDMA read."
		     " len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len);

	sock_hold(sk, SOCK_REF_RDMA_RD);

	if (len > rx_sa->len) {
		sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len);
		len = rx_sa->len;
	}

	rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem);
	if (rc) {
		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
		goto err_alloc_fmr;
	}

	rc = sdp_post_rdma_read(sk, rx_sa);
	if (unlikely(rc)) {
		sdp_warn(sk, "ib_post_send failed with status %d.\n", rc);
		sdp_set_error(ssk->socket, -ECONNRESET);
		goto err_post_send;
	}

	sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc);

	got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq;

	sdp_wait_rdma_wr_finished(ssk);

	sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc);
	if (!ssk->qp_active) {
		sdp_dbg_data(sk, "QP destroyed during RDMA read\n");
		rc = -EPIPE;
		goto err_post_send;
	}

	copied = rx_sa->umem->length;

	sdp_update_iov_used(sk, iov, copied);
	rx_sa->used += copied;
	atomic_add(copied, &ssk->rcv_nxt);

	ssk->tx_ring.rdma_inflight = NULL;

err_post_send:
	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);

err_alloc_fmr:
	if (rc && ssk->qp_active) {
		sdp_warn(sk, "Couldn't do RDMA - post sendsm\n");
		rx_sa->flags |= RX_SA_ABORTED;
	}

	sock_put(sk, SOCK_REF_RDMA_RD);
	return rc;
}

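/* Wait for tx credits/send-buffer space before posting a SrcAvail. */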
static inline int wait_for_sndbuf(struct socket *sk, long *timeo_p)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	int ret = 0;
	int credits_needed = 1;

	sdp_dbg_data(sk, "Wait for mem\n");

	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
	SDPSTATS_COUNTER_INC(send_wait_for_mem);

	sdp_xmit_poll(ssk, 1);

	ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed);
	return ret;
}

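/*
 * Send one iovec by zero copy: map it, post a SrcAvail, then wait for the
 * peer's RdmaRdCompl (falling back to SrcAvailCancel/SendSM handling on error).
 */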
static int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa,
		struct iovec *iov, long *timeo)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	unsigned long lock_flags;
	int rc;

	rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len,
			   &tx_sa->fmr, &tx_sa->umem);
	if (rc) {
		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
		goto err_alloc_fmr;
	}

	if (tx_slots_free(ssk) == 0) {
		rc = wait_for_sndbuf(sk, timeo);
		if (rc) {
			sdp_warn(sk, "Couldn't get send buffer\n");
			goto err_no_tx_slots;
		}
	}

	rc = sdp_post_srcavail(sk, tx_sa);
	if (rc) {
		sdp_dbg(sk, "Error posting SrcAvail\n");
		goto err_abort_send;
	}

	rc = sdp_wait_rdmardcompl(ssk, timeo, 0);
	if (unlikely(rc)) {
		enum tx_sa_flag f = tx_sa->abort_flags;

		if (f & TX_SA_SENDSM) {
			sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n");
		} else if (f & TX_SA_ERROR) {
			sdp_dbg_data(sk, "SrcAvail error completion\n");
			SDPSTATS_COUNTER_INC(zcopy_tx_error);
		} else if (ssk->qp_active) {
			sdp_post_srcavail_cancel(sk);

			/* Wait for RdmaRdCompl/SendSM to
			 * finish the transaction */
			sdp_dbg_data(sk, "Waiting for SendSM\n");
			sdp_wait_rdmardcompl(ssk, timeo, 1);
			sdp_dbg_data(sk, "finished waiting\n");

			cancel_delayed_work(&ssk->srcavail_cancel_work);
		} else {
			sdp_dbg_data(sk, "QP was destroyed while waiting\n");
		}
	} else {
		sdp_dbg_data(sk, "got RdmaRdCompl\n");
	}

	spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags);
	ssk->tx_sa = NULL;
	spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags);

err_abort_send:
	sdp_update_iov_used(sk, iov, tx_sa->bytes_acked);

err_no_tx_slots:
	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);

err_alloc_fmr:
	return rc;
}

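/*
 * Zero-copy sendmsg() entry point. Loops over the iovec in SrcAvail-sized
 * pieces and returns the number of bytes consumed, or a negative error.
 */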
int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct tx_srcavail_state *tx_sa;
	size_t bytes_to_copy = 0;
	int copied = 0;
	int offset;
	long timeo;
	int rc = 0;

	sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n",
		     iov->iov_base, iov->iov_len);
	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start");
	if (ssk->rx_sa) {
		sdp_dbg_data(sk, "Deadlock prevent: crossing SrcAvail\n");
		return 0;
	}

	sock_hold(ssk->socket, SOCK_REF_ZCOPY);
	SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment);

	timeo = SDP_SRCAVAIL_ADV_TIMEOUT;

	/* Ok, commence sending. */
	offset = (unsigned long)iov->iov_base & (PAGE_SIZE - 1);

	tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL);
	if (!tx_sa) {
		sdp_warn(sk, "Error allocating zcopy context\n");
		rc = -EAGAIN; /* Buffer too big - fallback to bcopy */
		goto err_alloc_tx_sa;
	}

	bytes_to_copy = iov->iov_len;
	do {
		rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo);

		if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) {
			sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n",
				     iov->iov_len);
			break;
		}
	} while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags);

	kfree(tx_sa);
err_alloc_tx_sa:
	copied = bytes_to_copy - iov->iov_len;

	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied);

	sock_put(ssk->socket, SOCK_REF_ZCOPY);

	if (rc < 0 && rc != -EAGAIN && rc != -ETIME)
		return rc;

	return copied;
}

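/* Tear down a pending tx SrcAvail (e.g. on socket close or reset). */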
void sdp_abort_srcavail(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
	unsigned long flags;

	if (!tx_sa)
		return;

	cancel_delayed_work(&ssk->srcavail_cancel_work);
	flush_scheduled_work();

	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
	ssk->tx_sa = NULL;
	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
}

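/* Tear down a pending rx SrcAvail / RDMA read state. */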
void sdp_abort_rdma_read(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct rx_srcavail_state *rx_sa = ssk->rx_sa;

	if (!rx_sa)
		return;

	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);
	ssk->rx_sa = NULL;
}