/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/proc_fs.h>
#include <linux/cred.h>

#include "mlx4_ib.h"
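
/*
 * Translate IB access flags into the MPT permission bits the mlx4 HW
 * expects.  Local read access is always granted.
 */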
static u32 convert_access(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
	       MLX4_PERM_LOCAL_READ;
}
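
/*
 * Shared MR support: an MR registered with any of the IB_ACCESS_SHARED_MR_*
 * flags is exported as an entry under /proc (see prepare_shared_mr() below)
 * that other processes can open and mmap().  The read/write handlers that
 * follow are stubs; only mmap() of the entry is meaningful here.
 */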
static ssize_t shared_mr_proc_read(struct file *file,
				   char __user *buffer,
				   size_t len,
				   loff_t *offset)
{
	return -ENOSYS;
}

static ssize_t shared_mr_proc_write(struct file *file,
				    const char __user *buffer,
				    size_t len,
				    loff_t *offset)
{
	return -ENOSYS;
}
static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
	struct mlx4_shared_mr_info *smr_info =
		(struct mlx4_shared_mr_info *)pde->data;

	/* Prevent any mapping that does not start at the beginning of the area */
	if (vma->vm_pgoff != 0)
		return -EINVAL;

	return ib_umem_map_to_vma(smr_info->umem, vma);
}
static const struct file_operations shared_mr_proc_ops = {
	.read	= shared_mr_proc_read,
	.write	= shared_mr_proc_write,
	.mmap	= shared_mr_mmap
};
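
/*
 * Map the IB_ACCESS_SHARED_MR_* flags onto owner/group/other permission
 * bits for the proc entry, so ordinary file-mode checks decide which
 * processes may access a shared MR.
 */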
static mode_t convert_shared_access(int acc)
{
	return (acc & IB_ACCESS_SHARED_MR_USER_READ   ? S_IRUSR : 0) |
	       (acc & IB_ACCESS_SHARED_MR_USER_WRITE  ? S_IWUSR : 0) |
	       (acc & IB_ACCESS_SHARED_MR_GROUP_READ  ? S_IRGRP : 0) |
	       (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE ? S_IWGRP : 0) |
	       (acc & IB_ACCESS_SHARED_MR_OTHER_READ  ? S_IROTH : 0) |
	       (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE ? S_IWOTH : 0);
}
struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx4_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;

	return &mr->ibmr;

err_mr:
	mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}
static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
					struct mlx4_mtt *mtt,
					u64 mtt_size,
					u64 mtt_shift,
					u64 len,
					u64 cur_start_addr,
					u64 *pages,
					int *start_index,
					int *npages)
{
	int k;
	int err = 0;
	u64 mtt_entries;
	u64 cur_end_addr = cur_start_addr + len;
	u64 cur_end_addr_aligned = 0;

	len += (cur_start_addr & (mtt_size - 1ULL));
	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
	len += (cur_end_addr_aligned - cur_end_addr);
	if (len & (mtt_size - 1ULL)) {
		WARN(1,
		     "write_block: len %llx is not aligned to mtt_size %llx\n",
		     (long long)len, (long long)mtt_size);
		return -EINVAL;
	}

	mtt_entries = (len >> mtt_shift);

	/*
	 * Align the MTT start address to the MTT size.  This is required to
	 * handle cases when the MR starts in the middle of an MTT record.
	 * It was not required in the old code, since the physical addresses
	 * provided by the DMA subsystem were page aligned, which was also
	 * the MTT size.
	 */
	cur_start_addr = round_down(cur_start_addr, mtt_size);
	/* A new block is started ... */
	for (k = 0; k < mtt_entries; ++k) {
		pages[*npages] = cur_start_addr + (mtt_size * k);
		(*npages)++;
		/*
		 * Be friendly to mlx4_write_mtt() and
		 * pass it chunks of appropriate size.
		 */
		if (*npages == PAGE_SIZE / sizeof(u64)) {
			err = mlx4_write_mtt(dev->dev, mtt, *start_index,
					     *npages, pages);
			if (err)
				return err;

			(*start_index) += *npages;
			*npages = 0;
		}
	}

	return 0;
}
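
/*
 * Walk the umem's scatter entries, merge physically contiguous entries into
 * blocks and emit one MTT entry per mtt_size within each block.  For example
 * (illustrative numbers): two contiguous 4 KB entries at 0x200000 and
 * 0x201000 form a single 8 KB block and, with a 4 KB MTT page size, produce
 * two MTT entries.
 */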
int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
			   struct ib_umem *umem)
{
	u64 *pages;
	struct ib_umem_chunk *chunk;
	int j;
	u64 len = 0;
	int err = 0;
	u64 mtt_size;
	u64 cur_start_addr = 0;
	u64 mtt_shift;
	int start_index = 0;
	int npages = 0;

	pages = (u64 *) __get_free_page(GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mtt_shift = mtt->page_shift;
	mtt_size = 1ULL << mtt_shift;

	list_for_each_entry(chunk, &umem->chunk_list, list)
		for (j = 0; j < chunk->nmap; ++j) {
			if (cur_start_addr + len ==
			    sg_dma_address(&chunk->page_list[j])) {
				/* still the same block */
				len += sg_dma_len(&chunk->page_list[j]);
				continue;
			}
			/* A new block is started ... */
			/*
			 * If len is misaligned, write an extra mtt entry to
			 * cover the misaligned area (round up the division).
			 */
			err = mlx4_ib_umem_write_mtt_block(dev,
							   mtt, mtt_size,
							   mtt_shift,
							   len, cur_start_addr,
							   pages,
							   &start_index,
							   &npages);
			if (err)
				goto out;

			cur_start_addr =
				sg_dma_address(&chunk->page_list[j]);
			len = sg_dma_len(&chunk->page_list[j]);
		}

	/* Handle the last block */
	if (len > 0) {
		/*
		 * If len is misaligned, write an extra mtt entry to cover
		 * the misaligned area (round up the division).
		 */
		err = mlx4_ib_umem_write_mtt_block(dev,
						   mtt, mtt_size, mtt_shift,
						   len, cur_start_addr,
						   pages,
						   &start_index,
						   &npages);
		if (err)
			goto out;
	}

	if (npages)
		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);

out:
	free_page((unsigned long) pages);
	return err;
}
static inline u64 alignment_of(u64 ptr)
{
	return ilog2(ptr & (~(ptr - 1)));
}
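
/*
 * alignment_of() returns the index of the lowest set bit, i.e. the largest
 * power-of-two alignment of the address: e.g. alignment_of(0x6000) == 13 and
 * alignment_of(0x200000) == 21 (illustrative values).
 */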
static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
					u64 current_block_end,
					u64 block_shift)
{
	/*
	 * Check whether the new block is aligned as well as the previous
	 * block: its address must have zeros in all bits below block_shift.
	 */
	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
		/*
		 * It is not as well aligned as the previous block - reduce
		 * the mtt size accordingly.  Here we take the lowest bit
		 * that is set.
		 */
		block_shift = alignment_of(next_block_start);

	/*
	 * Check whether the end of the previous block is aligned as well as
	 * the start of the new block.
	 */
	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
		/*
		 * It is not as well aligned as the start of the block -
		 * reduce the mtt size accordingly.
		 */
		block_shift = alignment_of(current_block_end);

	return block_shift;
}
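
/*
 * Example for mlx4_ib_umem_calc_block_mtt() above (illustrative numbers):
 * with block_shift == 20, a next_block_start of 0x300000 is still 1 MB
 * aligned, but a current_block_end of 0x204000 is only 16 KB aligned, so the
 * returned shift drops to 14.
 */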
/*
 * Calculate the optimal MTT size based on contiguous pages.
 * The function also returns, via *num_of_mtts, the number of MTT entries
 * needed to cover the region with that size: if the first or last block is
 * not aligned to the calculated mtt_size, the misaligned remainder is added
 * to the total length as well.  All blocks in the middle are already handled
 * as part of the mtt shift calculation for both their start and end
 * addresses.
 */
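/*
 * Worked example (illustrative numbers, assuming the virtual start address
 * has the same low-order bits as the first physical block): scatter entries
 * at 0x200000 (8 KB), 0x202000 (8 KB) and 0x300000 (4 KB) merge into a 16 KB
 * block ending at 0x204000 followed by a 4 KB block.  The split caps
 * block_shift at alignment_of(0x204000) == 14, the last block is rounded up
 * to a 16 KB boundary, and the MR is described by two 16 KB MTT entries.
 */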
int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
					u64 start_va,
					int *num_of_mtts)
{
	struct ib_umem_chunk *chunk;
	int j;
	u64 block_shift = MLX4_MAX_MTT_SHIFT;
	u64 current_block_len = 0;
	u64 current_block_start = 0;
	u64 misalignment_bits;
	u64 first_block_start = 0;
	u64 last_block_end = 0;
	u64 total_len = 0;
	u64 last_block_aligned_end = 0;
	u64 min_shift = ilog2(umem->page_size);

	list_for_each_entry(chunk, &umem->chunk_list, list) {
		/*
		 * Initialization - save the first chunk start as
		 * current_block_start - a block means contiguous pages.
		 */
		if (current_block_len == 0 && current_block_start == 0) {
			first_block_start = current_block_start =
				sg_dma_address(&chunk->page_list[0]);
			/*
			 * Find the bits that differ between the physical
			 * address and the virtual address for the start of
			 * the MR.  umem_get aligned start_va to a page
			 * boundary, so align the start va to the same
			 * boundary here.
			 * misalignment_bits is needed to handle the case of
			 * a single memory region: there the rest of the
			 * logic will not reduce the block size.  If we used
			 * a block size bigger than the alignment of the
			 * misalignment bits, we might use the virtual page
			 * number instead of the physical page number,
			 * resulting in access to the wrong data.
			 */
			misalignment_bits =
				(start_va & (~(((u64)(umem->page_size)) - 1ULL)))
				^ current_block_start;
			block_shift = min(alignment_of(misalignment_bits),
					  block_shift);
		}

		/*
		 * Go over the scatter entries in the current chunk and check
		 * whether they continue the previous scatter entry.
		 */
		for (j = 0; j < chunk->nmap; ++j) {
			u64 next_block_start =
				sg_dma_address(&chunk->page_list[j]);
			u64 current_block_end = current_block_start
				+ current_block_len;
			/* If we have a split (non-contig.) between two blocks */
			if (current_block_end != next_block_start) {
				block_shift = mlx4_ib_umem_calc_block_mtt(
						next_block_start,
						current_block_end,
						block_shift);

				/*
				 * If we reached the minimum shift for a 4k
				 * page, stop the loop.
				 */
				if (block_shift <= min_shift)
					goto end;

				/* Add the completed block to the total length. */
				total_len += current_block_len;

				/* Start a new block */
				current_block_start = next_block_start;
				current_block_len =
					sg_dma_len(&chunk->page_list[j]);
				continue;
			}

			/*
			 * The scatter entry is another part of the current
			 * block: increase the block size.  An entry in the
			 * scatter list can be larger than 4k (one page) when
			 * the dma mapping merges some blocks together.
			 */
			current_block_len +=
				sg_dma_len(&chunk->page_list[j]);
		}
	}

	/* Account for the last block in the total len */
	total_len += current_block_len;
	/* Add to the first block the misalignment that it suffers from. */
	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
	last_block_end = current_block_start + current_block_len;
	last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift);
	total_len += (last_block_aligned_end - last_block_end);

	WARN((total_len & ((1ULL << block_shift) - 1ULL)),
	     " misaligned total length detected (%llu, %llu)!",
	     (long long)total_len, (long long)block_shift);

	*num_of_mtts = total_len >> block_shift;
end:
	if (block_shift < min_shift) {
		/*
		 * If the shift is less than the minimum, warn and return the
		 * minimum shift.
		 */
		WARN(1,
		     "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
		     (long long)block_shift);

		block_shift = min_shift;
	}
	return block_shift;
}
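
/*
 * Expose a shared MR through procfs: the entry is named by the mr_id in hex,
 * its mode is derived from the IB_ACCESS_SHARED_MR_* flags and it is owned
 * by the registering task, so regular file permissions govern which
 * processes may open and mmap it.
 */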
static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
{
	struct proc_dir_entry *mr_proc_entry;
	mode_t mode = S_IFREG;
	char name_buff[16];

	mode |= convert_shared_access(access_flags);
	sprintf(name_buff, "%X", mr_id);
	mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
	if (!mr->smr_info)
		return -ENOMEM;

	mr->smr_info->mr_id = mr_id;
	mr->smr_info->umem = mr->umem;

	mr_proc_entry = proc_create_data(name_buff, mode,
					 mlx4_mrs_dir_entry,
					 &shared_mr_proc_ops,
					 mr->smr_info);

	if (!mr_proc_entry) {
		pr_err("prepare_shared_mr failed via proc\n");
		kfree(mr->smr_info);
		return -ENODEV;
	}

	current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
	mr_proc_entry->size = mr->umem->length;

	return 0;
}
static int is_shared_mr(int access_flags)
{
	/*
	 * Check whether IB_ACCESS_SHARED_MR_USER_READ or any of the other
	 * shared access bits is set.
	 */
	return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
				  IB_ACCESS_SHARED_MR_USER_WRITE |
				  IB_ACCESS_SHARED_MR_GROUP_READ |
				  IB_ACCESS_SHARED_MR_GROUP_WRITE |
				  IB_ACCESS_SHARED_MR_OTHER_READ |
				  IB_ACCESS_SHARED_MR_OTHER_WRITE));
}
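
/*
 * Register a user MR: pin the user pages with ib_umem_get(), pick the
 * optimal MTT page size for the resulting layout, allocate and program the
 * MR, and, if any shared access flag is set, export it through procfs.
 */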
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata,
				  int mr_id)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int shift;
	int err;
	int n;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->umem = ib_umem_get(pd->uobject->context, start, length,
			       access_flags, 0);
	if (IS_ERR(mr->umem)) {
		err = PTR_ERR(mr->umem);
		goto err_free;
	}

	n = ib_umem_page_count(mr->umem);
	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
			    convert_access(access_flags), n, shift, &mr->mmr);
	if (err)
		goto err_umem;

	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
	if (err)
		goto err_mr;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;

	/* Check whether the MR should be shared */
	if (is_shared_mr(access_flags)) {
		/*
		 * The start address and length must be aligned to the page
		 * size in order to map a full page and prevent leakage of
		 * data.
		 */
		if (mr->umem->offset || (length & ~PAGE_MASK)) {
			err = -EINVAL;
			goto err_mr;
		}

		err = prepare_shared_mr(mr, access_flags, mr_id);
		if (err)
			goto err_mr;
	}

	return &mr->ibmr;

err_mr:
	mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_umem:
	ib_umem_release(mr->umem);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}
int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);

	mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
	if (mr->smr_info) {
		/*
		 * Once the master/parent shared MR is deregistered there is
		 * no way to share it any more - its mr_id is returned to the
		 * kernel as part of ib_uverbs_dereg_mr and may be allocated
		 * again by another reg_mr.
		 */
		char name_buff[16];

		sprintf(name_buff, "%X", mr->smr_info->mr_id);
		/*
		 * remove_proc_entry() checks internally that no operation is
		 * in flight on that proc file, and waits for any such
		 * operation to finish.  That is why no extra synchronization
		 * is needed before releasing the shared umem below.
		 */
		remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
		kfree(mr->smr_info);
	}

	if (mr->umem)
		ib_umem_release(mr->umem);

	kfree(mr);

	return 0;
}
struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
					int max_page_list_len)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
			    max_page_list_len, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;

	return &mr->ibmr;

err_mr:
	mlx4_mr_free(dev->dev, &mr->mmr);

err_free:
	kfree(mr);
	return ERR_PTR(err);
}
struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
							       int page_list_len)
{
	struct mlx4_ib_dev *dev = to_mdev(ibdev);
	struct mlx4_ib_fast_reg_page_list *mfrpl;
	int size = page_list_len * sizeof (u64);

	if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
		return ERR_PTR(-EINVAL);

	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
	if (!mfrpl)
		return ERR_PTR(-ENOMEM);

	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
	if (!mfrpl->ibfrpl.page_list)
		goto err_free;

	mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
						     size, &mfrpl->map,
						     GFP_KERNEL);
	if (!mfrpl->mapped_page_list)
		goto err_free;

	WARN_ON(mfrpl->map & 0x3f);

	return &mfrpl->ibfrpl;

err_free:
	kfree(mfrpl->ibfrpl.page_list);
	kfree(mfrpl);
	return ERR_PTR(-ENOMEM);
}
void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
{
	struct mlx4_ib_dev *dev = to_mdev(page_list->device);
	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
	int size = page_list->max_page_list_len * sizeof (u64);

	dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
			  mfrpl->map);
	kfree(mfrpl->ibfrpl.page_list);
	kfree(mfrpl);
}
struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
				 struct ib_fmr_attr *fmr_attr)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_fmr *fmr;
	int err = -ENOMEM;

	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
	if (!fmr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
			     fmr_attr->max_pages, fmr_attr->max_maps,
			     fmr_attr->page_shift, &fmr->mfmr);
	if (err)
		goto err_free;

	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
	if (err)
		goto err_mr;

	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;

	return &fmr->ibfmr;

err_mr:
	mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);

err_free:
	kfree(fmr);

	return ERR_PTR(err);
}
int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
			 int npages, u64 iova)
{
	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);

	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
}
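
/*
 * Note that mapping an FMR only reprograms its MTT entries and updates the
 * keys; invalidation is deferred to mlx4_ib_unmap_fmr(), which unmaps a
 * whole list of FMRs and then issues a single SYNC_TPT command.
 */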
int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
{
	struct ib_fmr *ibfmr;
	int err;
	struct mlx4_dev *mdev = NULL;

	list_for_each_entry(ibfmr, fmr_list, list) {
		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
			return -EINVAL;
		mdev = to_mdev(ibfmr->device)->dev;
	}

	if (!mdev)
		return 0;

	list_for_each_entry(ibfmr, fmr_list, list) {
		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);

		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
	}

	/*
	 * Make sure all MPT status updates are visible before issuing
	 * SYNC_TPT firmware command.
	 */
	wmb();

	err = mlx4_SYNC_TPT(mdev);
	if (err)
		pr_warn("SYNC_TPT error %d when unmapping FMRs\n", err);

	return 0;
}
int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
{
	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
	int err;

	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
	if (!err)
		kfree(ifmr);

	return err;
}