/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sched.h>

#ifdef __linux__
#include <linux/proc_fs.h>
#include <linux/cred.h>
#endif

#include "mlx4_ib.h"

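/*
 * Translate IB access flags into the corresponding mlx4 MPT permission
 * bits.  Local read access is always granted.
 */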
static u32 convert_access(int acc)
{
        return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
               (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
               (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
               (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
               MLX4_PERM_LOCAL_READ;
}
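
/*
 * Shared MR support (Linux only): a registered MR can be exposed through a
 * /proc entry so that other processes may mmap() the same memory.  The
 * read/write handlers are stubs; only mmap() is implemented.
 */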
#ifdef __linux__
static ssize_t shared_mr_proc_read(struct file *file,
                          char __user *buffer,
                          size_t len,
                          loff_t *offset)
{
        return -ENOSYS;
}

static ssize_t shared_mr_proc_write(struct file *file,
                           const char __user *buffer,
                           size_t len,
                           loff_t *offset)
{
        return -ENOSYS;
}

static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
        struct mlx4_shared_mr_info *smr_info =
                (struct mlx4_shared_mr_info *)pde->data;

        /* Prevent any mapping not on start of area */
        if (vma->vm_pgoff != 0)
                return -EINVAL;

        return ib_umem_map_to_vma(smr_info->umem, vma);
}

static const struct file_operations shared_mr_proc_ops = {
        .owner  = THIS_MODULE,
        .read   = shared_mr_proc_read,
        .write  = shared_mr_proc_write,
        .mmap   = shared_mr_mmap
};

static mode_t convert_shared_access(int acc)
{
        return (acc & IB_ACCESS_SHARED_MR_USER_READ   ? S_IRUSR : 0) |
               (acc & IB_ACCESS_SHARED_MR_USER_WRITE  ? S_IWUSR : 0) |
               (acc & IB_ACCESS_SHARED_MR_GROUP_READ  ? S_IRGRP : 0) |
               (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE ? S_IWGRP : 0) |
               (acc & IB_ACCESS_SHARED_MR_OTHER_READ  ? S_IROTH : 0) |
               (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE ? S_IWOTH : 0);
}
#endif
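
/*
 * Allocate an MR that spans the entire address space (start 0, length
 * ~0ULL); this implements the get_dma_mr verb used for local DMA access.
 */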
struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
        struct mlx4_ib_mr *mr;
        int err;

        mr = kzalloc(sizeof *mr, GFP_KERNEL);
        if (!mr)
                return ERR_PTR(-ENOMEM);

        err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
                            ~0ull, convert_access(acc), 0, 0, &mr->mmr);
        if (err)
                goto err_free;

        err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
        if (err)
                goto err_mr;

        mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
        mr->umem = NULL;

        return &mr->ibmr;

err_mr:
        mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_free:
        kfree(mr);

        return ERR_PTR(err);
}

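/*
 * Write MTT entries for one physically contiguous block of a umem.  The
 * block is expanded so that both its start and end are aligned to
 * mtt_size, and the resulting entries are accumulated in 'pages', which
 * is flushed to the device a page-full at a time via mlx4_write_mtt().
 */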
static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
                                                struct mlx4_mtt *mtt,
                                                u64 mtt_size,
                                                u64 mtt_shift,
                                                u64 len,
                                                u64 cur_start_addr,
                                                u64 *pages,
                                                int *start_index,
                                                int *npages)
{
        int k;
        int err = 0;
        u64 mtt_entries;
        u64 cur_end_addr = cur_start_addr + len;
        u64 cur_end_addr_aligned = 0;

        len += (cur_start_addr & (mtt_size - 1ULL));
        cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
        len += (cur_end_addr_aligned - cur_end_addr);
        if (len & (mtt_size - 1ULL)) {
                WARN(1,
                "write_block: len %llx is not aligned to mtt_size %llx\n",
                        (long long)len, (long long)mtt_size);
                return -EINVAL;
        }

        mtt_entries = (len >> mtt_shift);

        /*
         * Align the MTT start address to the mtt_size.  This is required to
         * handle cases when the MR starts in the middle of an MTT record.
         * It was not needed in the old code, since the physical addresses
         * provided by the DMA subsystem were page aligned, which was also
         * the MTT size.
         */
        cur_start_addr = round_down(cur_start_addr, mtt_size);
        /* A new block is started ... */
        for (k = 0; k < mtt_entries; ++k) {
                pages[*npages] = cur_start_addr + (mtt_size * k);
                (*npages)++;
                /*
                 * Be friendly to mlx4_write_mtt() and
                 * pass it chunks of appropriate size.
                 */
                if (*npages == PAGE_SIZE / sizeof(u64)) {
                        err = mlx4_write_mtt(dev->dev,
                                        mtt, *start_index,
                                        *npages, pages);
                        if (err)
                                return err;

                        (*start_index) += *npages;
                        *npages = 0;
                }
        }

        return 0;
}

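/*
 * Walk the scatter/gather entries of a umem, merge physically contiguous
 * runs into blocks, and write an MTT entry for every mtt_size chunk of
 * each block.
 */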
int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
                           struct ib_umem *umem)
{
        u64 *pages;
        struct ib_umem_chunk *chunk;
        int j;
        u64 len = 0;
        int err = 0;
        u64 mtt_size;
        u64 cur_start_addr = 0;
        u64 mtt_shift;
        int start_index = 0;
        int npages = 0;

        pages = (u64 *) __get_free_page(GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        mtt_shift = mtt->page_shift;
        mtt_size = 1ULL << mtt_shift;

        list_for_each_entry(chunk, &umem->chunk_list, list)
                for (j = 0; j < chunk->nmap; ++j) {
                        if (cur_start_addr + len ==
                            sg_dma_address(&chunk->page_list[j])) {
                                /* still the same block */
                                len += sg_dma_len(&chunk->page_list[j]);
                                continue;
                        }
                        /*
                         * A new block is started ...
                         * If len is misaligned, write an extra mtt entry to
                         * cover the misaligned area (round up the division).
                         */
                        err = mlx4_ib_umem_write_mtt_block(dev,
                                                mtt, mtt_size, mtt_shift,
                                                len, cur_start_addr,
                                                pages,
                                                &start_index,
                                                &npages);
                        if (err)
                                goto out;

                        cur_start_addr =
                                sg_dma_address(&chunk->page_list[j]);
                        len = sg_dma_len(&chunk->page_list[j]);
                }

        /* Handle the last block */
        if (len > 0) {
                /*
                 * If len is misaligned, write an extra mtt entry to cover
                 * the misaligned area (round up the division).
                 */
                err = mlx4_ib_umem_write_mtt_block(dev,
                                                mtt, mtt_size, mtt_shift,
                                                len, cur_start_addr,
                                                pages,
                                                &start_index,
                                                &npages);
                if (err)
                        goto out;
        }

        if (npages)
                err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);

out:
        free_page((unsigned long) pages);
        return err;
}

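/*
 * Return the alignment of 'ptr' as a power-of-two exponent, i.e. the
 * index of its least significant set bit.
 */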
static inline u64 alignment_of(u64 ptr)
{
        return ilog2(ptr & (~(ptr - 1)));
}

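/*
 * Reduce 'block_shift' as needed so that both the start of the next block
 * and the end of the current block are aligned to 1 << block_shift.
 */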
static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
                                                u64 current_block_end,
                                                u64 block_shift)
{
        /*
         * Check whether the new block is aligned as well as the previous
         * block was: a block address must start with zeros up to
         * entity_size.
         */
        if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
                /*
                 * It is not as well aligned as the previous block; reduce
                 * the mtt size accordingly.  Here we take the lowest set
                 * bit.
                 */
                block_shift = alignment_of(next_block_start);

        /*
         * Check whether the end of the previous block is aligned as well
         * as its start.
         */
        if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
                /*
                 * It is not as well aligned as the start of the block;
                 * reduce the mtt size accordingly.
                 */
                block_shift = alignment_of(current_block_end);

        return block_shift;
}

/*
 * Calculate the optimal mtt size based on contiguous pages.
 *
 * The function also accounts, through *num_of_mtts, for the pages that
 * are not aligned to the calculated mtt_size, so they are added to the
 * total number of pages.  For that we check the first and last chunk
 * lengths and, if they are not aligned to mtt_size, count in the
 * non-aligned pages.  All chunks in the middle are already handled as
 * part of the mtt shift calculation for both their start and end
 * addresses.
 */
int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
                                                u64 start_va,
                                                int *num_of_mtts)
{
        struct ib_umem_chunk *chunk;
        int j;
        u64 block_shift = MLX4_MAX_MTT_SHIFT;
        u64 current_block_len = 0;
        u64 current_block_start = 0;
        u64 misalignment_bits;
        u64 first_block_start = 0;
        u64 last_block_end = 0;
        u64 total_len = 0;
        u64 last_block_aligned_end = 0;
        u64 min_shift = ilog2(umem->page_size);

        list_for_each_entry(chunk, &umem->chunk_list, list) {
                /*
                 * Initialization - save the first chunk start as the
                 * current_block_start - a block means contiguous pages.
                 */
                if (current_block_len == 0 && current_block_start == 0) {
                        first_block_start = current_block_start =
                                sg_dma_address(&chunk->page_list[0]);
                        /*
                         * Find the bits that are different between the
                         * physical address and the virtual address for the
                         * start of the MR.
                         *
                         * umem_get aligned start_va to a page boundary.
                         * Therefore, we need to align the start va to the
                         * same boundary.
                         *
                         * misalignment_bits is needed to handle the case of
                         * a single memory region.  In this case, the rest of
                         * the logic will not reduce the block size.  If we
                         * use a block size which is bigger than the
                         * alignment of the misalignment bits, we might use
                         * the virtual page number instead of the physical
                         * page number, resulting in access to the wrong
                         * data.
                         */
                        misalignment_bits =
                        (start_va & (~(((u64)(umem->page_size)) - 1ULL)))
                                                ^ current_block_start;
                        block_shift = min(alignment_of(misalignment_bits),
                                block_shift);
                }

                /*
                 * Go over the scatter entries in the current chunk and check
                 * whether they continue the previous scatter entry.
                 */
                for (j = 0; j < chunk->nmap; ++j) {
                        u64 next_block_start =
                                sg_dma_address(&chunk->page_list[j]);
                        u64 current_block_end = current_block_start
                                + current_block_len;
                        /* If we have a split (non-contiguous) between two blocks */
                        if (current_block_end != next_block_start) {
                                block_shift = mlx4_ib_umem_calc_block_mtt(
                                                next_block_start,
                                                current_block_end,
                                                block_shift);

                                /*
                                 * If we reached the minimum shift for a 4k
                                 * page we stop the loop.
                                 */
                                if (block_shift <= min_shift)
                                        goto end;

                                /*
                                 * If not saved yet we are in the first block;
                                 * we save the length of the first block to
                                 * calculate the non_aligned_pages number at
                                 * the end.
                                 */
                                total_len += current_block_len;

                                /* Start a new block */
                                current_block_start = next_block_start;
                                current_block_len =
                                        sg_dma_len(&chunk->page_list[j]);
                                continue;
                        }
                        /*
                         * The scatter entry is another part of the current
                         * block: increase the block size.  An entry in the
                         * scatter list can be larger than 4k (a page) because
                         * the dma mapping may merge some blocks together.
                         */
                        current_block_len +=
                                sg_dma_len(&chunk->page_list[j]);
                }
        }

        /* Account for the last block in the total len */
        total_len += current_block_len;
        /* Add to the first block the misalignment that it suffers from. */
        total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
        last_block_end = current_block_start + current_block_len;
        last_block_aligned_end = round_up(last_block_end, 1 << block_shift);
        total_len += (last_block_aligned_end - last_block_end);

        WARN((total_len & ((1ULL << block_shift) - 1ULL)),
                "misaligned total length detected (%llu, %llu)!",
                (long long)total_len, (long long)block_shift);

        *num_of_mtts = total_len >> block_shift;
end:
        if (block_shift < min_shift) {
                /*
                 * If the shift is less than the minimum, warn and return
                 * the minimum shift.
                 */
                WARN(1,
                "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
                (long long)block_shift);

                block_shift = min_shift;
        }
        return block_shift;
}

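/*
 * Helpers for shared MR registration (Linux only): prepare_shared_mr()
 * creates the /proc entry through which the MR can be mmap()ed by other
 * processes, and is_shared_mr() tests whether any of the shared-access
 * flags were requested.
 */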
#ifdef __linux__
static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
{
        struct proc_dir_entry *mr_proc_entry;
        mode_t mode = S_IFREG;
        char name_buff[16];

        mode |= convert_shared_access(access_flags);
        sprintf(name_buff, "%X", mr_id);
        mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
        if (!mr->smr_info)
                return -ENOMEM;
        mr->smr_info->mr_id = mr_id;
        mr->smr_info->umem = mr->umem;

        mr_proc_entry = proc_create_data(name_buff, mode,
                                mlx4_mrs_dir_entry,
                                &shared_mr_proc_ops,
                                mr->smr_info);

        if (!mr_proc_entry) {
                pr_err("prepare_shared_mr failed via proc\n");
                kfree(mr->smr_info);
                return -ENODEV;
        }

        current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
        mr_proc_entry->size = mr->umem->length;
        return 0;
}

static int is_shared_mr(int access_flags)
{
        /*
         * Check whether IB_ACCESS_SHARED_MR_USER_READ or any of the other
         * shared access bits are set.
         */
        return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
                                IB_ACCESS_SHARED_MR_USER_WRITE |
                                IB_ACCESS_SHARED_MR_GROUP_READ |
                                IB_ACCESS_SHARED_MR_GROUP_WRITE |
                                IB_ACCESS_SHARED_MR_OTHER_READ |
                                IB_ACCESS_SHARED_MR_OTHER_WRITE));
}
#endif

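/*
 * Register a user memory region: pin the user pages, compute the optimal
 * MTT page size for the region, write the MTT entries and enable the MR.
 */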
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                  u64 virt_addr, int access_flags,
                                  struct ib_udata *udata,
                                  int mr_id)
{
        struct mlx4_ib_dev *dev = to_mdev(pd->device);
        struct mlx4_ib_mr *mr;
        int shift;
        int err;
        int n;

        mr = kzalloc(sizeof *mr, GFP_KERNEL);
        if (!mr)
                return ERR_PTR(-ENOMEM);

        mr->umem = ib_umem_get(pd->uobject->context, start, length,
                        access_flags, 0);
        if (IS_ERR(mr->umem)) {
                err = PTR_ERR(mr->umem);
                goto err_free;
        }

        n = ib_umem_page_count(mr->umem);
        shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
                &n);
        err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
                         convert_access(access_flags), n, shift, &mr->mmr);
        if (err)
                goto err_umem;

        err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
        if (err)
                goto err_mr;

        err = mlx4_mr_enable(dev->dev, &mr->mmr);
        if (err)
                goto err_mr;

        mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
#ifdef __linux__
        /* Check whether the MR should be shared */
        if (is_shared_mr(access_flags)) {
                /*
                 * The start address and length must be aligned to the page
                 * size in order to map a full page and to prevent leakage
                 * of data.
                 */
                if (mr->umem->offset || (length & ~PAGE_MASK)) {
                        err = -EINVAL;
                        goto err_mr;
                }

                err = prepare_shared_mr(mr, access_flags, mr_id);
                if (err)
                        goto err_mr;
        }
#endif
        return &mr->ibmr;

err_mr:
        mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_umem:
        ib_umem_release(mr->umem);

err_free:
        kfree(mr);

        return ERR_PTR(err);
}

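/*
 * Deregister a memory region: release the HCA resources, remove the
 * shared-MR /proc entry if one was created, unpin the user pages and free
 * the MR structure.
 */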
int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
        struct mlx4_ib_mr *mr = to_mmr(ibmr);

        mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
        if (mr->smr_info) {
                /*
                 * When the master/parent shared MR is deregistered there is
                 * no way to share this MR any more - its mr_id will be
                 * returned to the kernel as part of ib_uverbs_dereg_mr and
                 * may be allocated again as part of another reg_mr.
                 */
#ifdef __linux__
                char name_buff[16];

                sprintf(name_buff, "%X", mr->smr_info->mr_id);
                /*
                 * remove_proc_entry() checks internally that no operation
                 * is in flight on that proc fs file; if one is, the current
                 * process waits until it completes.  That is why no extra
                 * synchronization is needed when we release the shared umem
                 * below.
                 */
                remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
                kfree(mr->smr_info);
#endif
        }

        if (mr->umem)
                ib_umem_release(mr->umem);

        kfree(mr);

        return 0;
}

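/*
 * Allocate a memory region for fast register work requests: the MR is
 * created with room for up to max_page_list_len pages but with no pages
 * attached yet.
 */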
struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
                                        int max_page_list_len)
{
        struct mlx4_ib_dev *dev = to_mdev(pd->device);
        struct mlx4_ib_mr *mr;
        int err;

        mr = kzalloc(sizeof *mr, GFP_KERNEL);
        if (!mr)
                return ERR_PTR(-ENOMEM);

        err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
                            max_page_list_len, 0, &mr->mmr);
        if (err)
                goto err_free;

        err = mlx4_mr_enable(dev->dev, &mr->mmr);
        if (err)
                goto err_mr;

        mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
        mr->umem = NULL;

        return &mr->ibmr;

err_mr:
        mlx4_mr_free(dev->dev, &mr->mmr);

err_free:
        kfree(mr);
        return ERR_PTR(err);
}

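/*
 * Allocate a page list for fast register work requests.  Besides the
 * regular page list, a DMA-coherent shadow copy (mapped_page_list) is
 * allocated for the device; its DMA address is expected to be 64-byte
 * aligned (see the WARN_ON below).
 */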
struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
                                                               int page_list_len)
{
        struct mlx4_ib_dev *dev = to_mdev(ibdev);
        struct mlx4_ib_fast_reg_page_list *mfrpl;
        int size = page_list_len * sizeof (u64);

        if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
                return ERR_PTR(-EINVAL);

        mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
        if (!mfrpl)
                return ERR_PTR(-ENOMEM);

        mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
        if (!mfrpl->ibfrpl.page_list)
                goto err_free;

        mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
                                                     size, &mfrpl->map,
                                                     GFP_KERNEL);
        if (!mfrpl->mapped_page_list)
                goto err_free;

        WARN_ON(mfrpl->map & 0x3f);

        return &mfrpl->ibfrpl;

err_free:
        kfree(mfrpl->ibfrpl.page_list);
        kfree(mfrpl);
        return ERR_PTR(-ENOMEM);
}

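/*
 * Free a fast register page list, including its DMA-coherent shadow copy.
 */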
void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
{
        struct mlx4_ib_dev *dev = to_mdev(page_list->device);
        struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
        int size = page_list->max_page_list_len * sizeof (u64);

        dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
                          mfrpl->map);
        kfree(mfrpl->ibfrpl.page_list);
        kfree(mfrpl);
}

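/*
 * Allocate a fast memory region (FMR) with the given access flags and
 * limits on the number of pages and remaps.
 */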
struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
                                 struct ib_fmr_attr *fmr_attr)
{
        struct mlx4_ib_dev *dev = to_mdev(pd->device);
        struct mlx4_ib_fmr *fmr;
        int err = -ENOMEM;

        fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
        if (!fmr)
                return ERR_PTR(-ENOMEM);

        err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
                             fmr_attr->max_pages, fmr_attr->max_maps,
                             fmr_attr->page_shift, &fmr->mfmr);
        if (err)
                goto err_free;

        err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
        if (err)
                goto err_mr;

        fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;

        return &fmr->ibfmr;

err_mr:
        mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);

err_free:
        kfree(fmr);

        return ERR_PTR(err);
}

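/*
 * Map a list of physical pages into an FMR at the given iova.
 */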
int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
                      int npages, u64 iova)
{
        struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
        struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);

        return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
                                 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
}

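/*
 * Unmap a list of FMRs.  All FMRs on the list must belong to the same
 * device; after they are all marked unmapped, a single SYNC_TPT firmware
 * command is issued for that device.
 */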
int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
{
        struct ib_fmr *ibfmr;
        int err;
        struct mlx4_dev *mdev = NULL;

        list_for_each_entry(ibfmr, fmr_list, list) {
                if (mdev && to_mdev(ibfmr->device)->dev != mdev)
                        return -EINVAL;
                mdev = to_mdev(ibfmr->device)->dev;
        }

        if (!mdev)
                return 0;

        list_for_each_entry(ibfmr, fmr_list, list) {
                struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);

                mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
        }

        /*
         * Make sure all MPT status updates are visible before issuing
         * SYNC_TPT firmware command.
         */
        wmb();

        err = mlx4_SYNC_TPT(mdev);
        if (err)
                pr_warn("SYNC_TPT error %d when unmapping FMRs\n", err);

        return 0;
}

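/*
 * Free an FMR.  The mlx4_ib_fmr structure is released only if the
 * underlying mlx4_fmr_free() succeeds.
 */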
int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
{
        struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
        struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
        int err;

        err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);

        if (!err)
                kfree(ifmr);

        return err;
}