1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem.h>
43 #include <rdma/ib_umem_odp.h>
44 #include <rdma/ib_verbs.h>
45 #include "dm.h"
46 #include "mlx5_ib.h"
47
48 /*
49 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
50 * work on kernel module memory
51 */
52 void *xlt_emergency_page;
53 static DEFINE_MUTEX(xlt_emergency_page_mutex);
54
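/*
 * Cap on the number of asynchronous mkey creations that may be outstanding
 * per cache entry at once; add_keys() returns -EAGAIN once it is reached.
 */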
55 enum {
56 MAX_PENDING_REG_MR = 8,
57 };
58
59 #define MLX5_UMR_ALIGN 2048
60
61 static void
62 create_mkey_callback(int status, struct mlx5_async_work *context);
63 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
64 u64 iova, int access_flags,
65 unsigned int page_size, bool populate);
66
67 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
68 struct ib_pd *pd)
69 {
70 struct mlx5_ib_dev *dev = to_mdev(pd->device);
71 bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev);
72
73 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
74 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
75 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
76 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
77 MLX5_SET(mkc, mkc, lr, 1);
78
79 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
80 MLX5_SET(mkc, mkc, relaxed_ordering_write,
81 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
82 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
83 MLX5_SET(mkc, mkc, relaxed_ordering_read,
84 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
85
86 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
87 MLX5_SET(mkc, mkc, qpn, 0xffffff);
88 MLX5_SET64(mkc, mkc, start_addr, start_addr);
89 }
90
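/*
 * The low 8 bits of an mkey (mkey_7_0) are chosen by software from a rolling
 * counter so that a recycled mkey index does not reproduce the same 32-bit
 * key; the firmware-assigned index fills the upper bits.
 */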
91 static void assign_mkey_variant(struct mlx5_ib_dev *dev,
92 struct mlx5_ib_mkey *mkey, u32 *in)
93 {
94 u8 key = atomic_inc_return(&dev->mkey_var);
95 void *mkc;
96
97 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
98 MLX5_SET(mkc, mkc, mkey_7_0, key);
99 mkey->key = key;
100 }
101
102 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
103 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
104 {
105 int ret;
106
107 assign_mkey_variant(dev, mkey, in);
108 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
109 if (!ret)
110 init_waitqueue_head(&mkey->wait);
111
112 return ret;
113 }
114
115 static int
116 mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
117 struct mlx5_ib_mkey *mkey,
118 struct mlx5_async_ctx *async_ctx,
119 u32 *in, int inlen, u32 *out, int outlen,
120 struct mlx5_async_work *context)
121 {
122 MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
123 assign_mkey_variant(dev, mkey, in);
124 return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
125 create_mkey_callback, context);
126 }
127
128 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
129 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
130
131 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
132 {
133 return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
134 }
135
136 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
137 {
138 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
139
140 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
141 }
142
143 static void create_mkey_callback(int status, struct mlx5_async_work *context)
144 {
145 struct mlx5_ib_mr *mr =
146 container_of(context, struct mlx5_ib_mr, cb_work);
147 struct mlx5_cache_ent *ent = mr->cache_ent;
148 struct mlx5_ib_dev *dev = ent->dev;
149 unsigned long flags;
150
151 if (status) {
152 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
153 kfree(mr);
154 spin_lock_irqsave(&ent->lock, flags);
155 ent->pending--;
156 WRITE_ONCE(dev->fill_delay, 1);
157 spin_unlock_irqrestore(&ent->lock, flags);
158 mod_timer(&dev->delay_timer, jiffies + HZ);
159 return;
160 }
161
162 mr->mmkey.type = MLX5_MKEY_MR;
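/* Combine the firmware-assigned mkey index with the variant chosen in assign_mkey_variant() */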
163 mr->mmkey.key |= mlx5_idx_to_mkey(
164 MLX5_GET(create_mkey_out, mr->out, mkey_index));
165 init_waitqueue_head(&mr->mmkey.wait);
166
167 WRITE_ONCE(dev->cache.last_add, jiffies);
168
169 spin_lock_irqsave(&ent->lock, flags);
170 list_add_tail(&mr->list, &ent->head);
171 ent->available_mrs++;
172 ent->total_mrs++;
173 /* If we are doing fill_to_high_water then keep going. */
174 queue_adjust_cache_locked(ent);
175 ent->pending--;
176 spin_unlock_irqrestore(&ent->lock, flags);
177 }
178
179 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
180 {
181 struct mlx5_ib_mr *mr;
182
183 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
184 if (!mr)
185 return NULL;
186 mr->cache_ent = ent;
187
188 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
189 MLX5_SET(mkc, mkc, free, 1);
190 MLX5_SET(mkc, mkc, umr_en, 1);
191 MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
192 MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
193
194 MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
195 MLX5_SET(mkc, mkc, log_page_size, ent->page);
196 return mr;
197 }
198
199 /* Asynchronously schedule new MRs to be populated in the cache. */
200 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
201 {
202 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
203 struct mlx5_ib_mr *mr;
204 void *mkc;
205 u32 *in;
206 int err = 0;
207 int i;
208
209 in = kzalloc(inlen, GFP_KERNEL);
210 if (!in)
211 return -ENOMEM;
212
213 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
214 for (i = 0; i < num; i++) {
215 mr = alloc_cache_mr(ent, mkc);
216 if (!mr) {
217 err = -ENOMEM;
218 break;
219 }
220 spin_lock_irq(&ent->lock);
221 if (ent->pending >= MAX_PENDING_REG_MR) {
222 err = -EAGAIN;
223 spin_unlock_irq(&ent->lock);
224 kfree(mr);
225 break;
226 }
227 ent->pending++;
228 spin_unlock_irq(&ent->lock);
229 err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
230 &ent->dev->async_ctx, in, inlen,
231 mr->out, sizeof(mr->out),
232 &mr->cb_work);
233 if (err) {
234 spin_lock_irq(&ent->lock);
235 ent->pending--;
236 spin_unlock_irq(&ent->lock);
237 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
238 kfree(mr);
239 break;
240 }
241 }
242
243 kfree(in);
244 return err;
245 }
246
247 /* Synchronously create a MR in the cache */
248 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
249 {
250 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
251 struct mlx5_ib_mr *mr;
252 void *mkc;
253 u32 *in;
254 int err;
255
256 in = kzalloc(inlen, GFP_KERNEL);
257 if (!in)
258 return ERR_PTR(-ENOMEM);
259 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
260
261 mr = alloc_cache_mr(ent, mkc);
262 if (!mr) {
263 err = -ENOMEM;
264 goto free_in;
265 }
266
267 err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
268 if (err)
269 goto free_mr;
270
271 init_waitqueue_head(&mr->mmkey.wait);
272 mr->mmkey.type = MLX5_MKEY_MR;
273 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
274 spin_lock_irq(&ent->lock);
275 ent->total_mrs++;
276 spin_unlock_irq(&ent->lock);
277 kfree(in);
278 return mr;
279 free_mr:
280 kfree(mr);
281 free_in:
282 kfree(in);
283 return ERR_PTR(err);
284 }
285
286 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
287 {
288 struct mlx5_ib_mr *mr;
289
290 lockdep_assert_held(&ent->lock);
291 if (list_empty(&ent->head))
292 return;
293 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
294 list_del(&mr->list);
295 ent->available_mrs--;
296 ent->total_mrs--;
297 spin_unlock_irq(&ent->lock);
298 mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
299 kfree(mr);
300 spin_lock_irq(&ent->lock);
301 }
302
303 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
304 bool limit_fill)
305 {
306 int err;
307
308 lockdep_assert_held(&ent->lock);
309
310 while (true) {
311 if (limit_fill)
312 target = ent->limit * 2;
313 if (target == ent->available_mrs + ent->pending)
314 return 0;
315 if (target > ent->available_mrs + ent->pending) {
316 u32 todo = target - (ent->available_mrs + ent->pending);
317
318 spin_unlock_irq(&ent->lock);
319 err = add_keys(ent, todo);
320 if (err == -EAGAIN)
321 usleep_range(3000, 5000);
322 spin_lock_irq(&ent->lock);
323 if (err) {
324 if (err != -EAGAIN)
325 return err;
326 } else
327 return 0;
328 } else {
329 remove_cache_mr_locked(ent);
330 }
331 }
332 }
333
334 static ssize_t size_write(struct file *filp, const char __user *buf,
335 size_t count, loff_t *pos)
336 {
337 struct mlx5_cache_ent *ent = filp->private_data;
338 u32 target;
339 int err;
340
341 err = kstrtou32_from_user(buf, count, 0, &target);
342 if (err)
343 return err;
344
345 /*
346 * Target is the new value of total_mrs the user requests; however, we
347 * cannot free MRs that are in use. Compute the target value for
348 * available_mrs.
349 */
350 spin_lock_irq(&ent->lock);
351 if (target < ent->total_mrs - ent->available_mrs) {
352 err = -EINVAL;
353 goto err_unlock;
354 }
355 target = target - (ent->total_mrs - ent->available_mrs);
356 if (target < ent->limit || target > ent->limit*2) {
357 err = -EINVAL;
358 goto err_unlock;
359 }
360 err = resize_available_mrs(ent, target, false);
361 if (err)
362 goto err_unlock;
363 spin_unlock_irq(&ent->lock);
364
365 return count;
366
367 err_unlock:
368 spin_unlock_irq(&ent->lock);
369 return err;
370 }
371
372 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
373 loff_t *pos)
374 {
375 struct mlx5_cache_ent *ent = filp->private_data;
376 char lbuf[20];
377 int err;
378
379 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
380 if (err < 0)
381 return err;
382
383 return simple_read_from_buffer(buf, count, pos, lbuf, err);
384 }
385
386 static const struct file_operations size_fops = {
387 .owner = THIS_MODULE,
388 .open = simple_open,
389 .write = size_write,
390 .read = size_read,
391 };
392
393 static ssize_t limit_write(struct file *filp, const char __user *buf,
394 size_t count, loff_t *pos)
395 {
396 struct mlx5_cache_ent *ent = filp->private_data;
397 u32 var;
398 int err;
399
400 err = kstrtou32_from_user(buf, count, 0, &var);
401 if (err)
402 return err;
403
404 /*
405 * Upon set we immediately fill the cache to high water mark implied by
406 * the limit.
407 */
408 spin_lock_irq(&ent->lock);
409 ent->limit = var;
410 err = resize_available_mrs(ent, 0, true);
411 spin_unlock_irq(&ent->lock);
412 if (err)
413 return err;
414 return count;
415 }
416
417 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
418 loff_t *pos)
419 {
420 struct mlx5_cache_ent *ent = filp->private_data;
421 char lbuf[20];
422 int err;
423
424 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
425 if (err < 0)
426 return err;
427
428 return simple_read_from_buffer(buf, count, pos, lbuf, err);
429 }
430
431 static const struct file_operations limit_fops = {
432 .owner = THIS_MODULE,
433 .open = simple_open,
434 .write = limit_write,
435 .read = limit_read,
436 };
437
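/*
 * Returns true if any cache entry is below its low water mark, meaning a
 * refill is (or should be) in progress somewhere in the cache.
 */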
438 static bool someone_adding(struct mlx5_mr_cache *cache)
439 {
440 unsigned int i;
441
442 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
443 struct mlx5_cache_ent *ent = &cache->ent[i];
444 bool ret;
445
446 spin_lock_irq(&ent->lock);
447 ret = ent->available_mrs < ent->limit;
448 spin_unlock_irq(&ent->lock);
449 if (ret)
450 return true;
451 }
452 return false;
453 }
454
455 /*
456 * Check if the bucket is outside the high/low water mark and schedule an async
457 * update. The cache refill has hysteresis, once the low water mark is hit it is
458 * refilled up to the high mark.
459 */
460 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
461 {
462 lockdep_assert_held(&ent->lock);
463
464 if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
465 return;
466 if (ent->available_mrs < ent->limit) {
467 ent->fill_to_high_water = true;
468 queue_work(ent->dev->cache.wq, &ent->work);
469 } else if (ent->fill_to_high_water &&
470 ent->available_mrs + ent->pending < 2 * ent->limit) {
471 /*
472 * Once we start populating due to hitting a low water mark
473 * continue until we pass the high water mark.
474 */
475 queue_work(ent->dev->cache.wq, &ent->work);
476 } else if (ent->available_mrs == 2 * ent->limit) {
477 ent->fill_to_high_water = false;
478 } else if (ent->available_mrs > 2 * ent->limit) {
479 /* Queue deletion of excess entries */
480 ent->fill_to_high_water = false;
481 if (ent->pending)
482 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
483 msecs_to_jiffies(1000));
484 else
485 queue_work(ent->dev->cache.wq, &ent->work);
486 }
487 }
488
489 static void __cache_work_func(struct mlx5_cache_ent *ent)
490 {
491 struct mlx5_ib_dev *dev = ent->dev;
492 struct mlx5_mr_cache *cache = &dev->cache;
493 int err;
494
495 spin_lock_irq(&ent->lock);
496 if (ent->disabled)
497 goto out;
498
499 if (ent->fill_to_high_water &&
500 ent->available_mrs + ent->pending < 2 * ent->limit &&
501 !READ_ONCE(dev->fill_delay)) {
502 spin_unlock_irq(&ent->lock);
503 err = add_keys(ent, 1);
504 spin_lock_irq(&ent->lock);
505 if (ent->disabled)
506 goto out;
507 if (err) {
508 /*
509 * EAGAIN only happens if pending is positive, so we
510 * will be rescheduled from create_mkey_callback(). The only
511 * failure path here is ENOMEM.
512 */
513 if (err != -EAGAIN) {
514 mlx5_ib_warn(
515 dev,
516 "command failed order %d, err %d\n",
517 ent->order, err);
518 queue_delayed_work(cache->wq, &ent->dwork,
519 msecs_to_jiffies(1000));
520 }
521 }
522 } else if (ent->available_mrs > 2 * ent->limit) {
523 bool need_delay;
524
525 /*
526 * The remove_cache_mr_locked() logic is performed as a garbage
527 * collection task. Such a task is intended to run when no
528 * other active processes are running.
529 *
530 * The need_resched() will return TRUE if there are user tasks
531 * to be activated in the near future.
532 *
533 * In such a case, we don't execute remove_cache_mr_locked() and
534 * postpone the garbage collection work to try to run in the next
535 * cycle, in order to free CPU resources for other tasks.
536 */
537 spin_unlock_irq(&ent->lock);
538 need_delay = need_resched() || someone_adding(cache) ||
539 !time_after(jiffies,
540 READ_ONCE(cache->last_add) + 300 * HZ);
541 spin_lock_irq(&ent->lock);
542 if (ent->disabled)
543 goto out;
544 if (need_delay)
545 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
546 remove_cache_mr_locked(ent);
547 queue_adjust_cache_locked(ent);
548 }
549 out:
550 spin_unlock_irq(&ent->lock);
551 }
552
553 static void delayed_cache_work_func(struct work_struct *work)
554 {
555 struct mlx5_cache_ent *ent;
556
557 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
558 __cache_work_func(ent);
559 }
560
561 static void cache_work_func(struct work_struct *work)
562 {
563 struct mlx5_cache_ent *ent;
564
565 ent = container_of(work, struct mlx5_cache_ent, work);
566 __cache_work_func(ent);
567 }
568
569 /* Allocate a special entry from the cache */
570 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
571 unsigned int entry, int access_flags)
572 {
573 struct mlx5_mr_cache *cache = &dev->cache;
574 struct mlx5_cache_ent *ent;
575 struct mlx5_ib_mr *mr;
576
577 if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
578 entry >= ARRAY_SIZE(cache->ent)))
579 return ERR_PTR(-EINVAL);
580
581 /* Matches access in alloc_cache_mr() */
582 if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
583 return ERR_PTR(-EOPNOTSUPP);
584
585 ent = &cache->ent[entry];
586 spin_lock_irq(&ent->lock);
587 if (list_empty(&ent->head)) {
588 spin_unlock_irq(&ent->lock);
589 mr = create_cache_mr(ent);
590 if (IS_ERR(mr))
591 return mr;
592 } else {
593 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
594 list_del(&mr->list);
595 ent->available_mrs--;
596 queue_adjust_cache_locked(ent);
597 spin_unlock_irq(&ent->lock);
598
599 mlx5_clear_mr(mr);
600 }
601 mr->access_flags = access_flags;
602 return mr;
603 }
604
605 /* Return a MR already available in the cache */
606 static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
607 {
608 struct mlx5_ib_mr *mr = NULL;
609 struct mlx5_cache_ent *ent = req_ent;
610
611 spin_lock_irq(&ent->lock);
612 if (!list_empty(&ent->head)) {
613 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
614 list_del(&mr->list);
615 ent->available_mrs--;
616 queue_adjust_cache_locked(ent);
617 spin_unlock_irq(&ent->lock);
618 mlx5_clear_mr(mr);
619 return mr;
620 }
621 queue_adjust_cache_locked(ent);
622 spin_unlock_irq(&ent->lock);
623 req_ent->miss++;
624 return NULL;
625 }
626
627 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
628 {
629 struct mlx5_cache_ent *ent = mr->cache_ent;
630
631 spin_lock_irq(&ent->lock);
632 list_add_tail(&mr->list, &ent->head);
633 ent->available_mrs++;
634 queue_adjust_cache_locked(ent);
635 spin_unlock_irq(&ent->lock);
636 }
637
638 static void clean_keys(struct mlx5_ib_dev *dev, int c)
639 {
640 struct mlx5_mr_cache *cache = &dev->cache;
641 struct mlx5_cache_ent *ent = &cache->ent[c];
642 struct mlx5_ib_mr *tmp_mr;
643 struct mlx5_ib_mr *mr;
644 LIST_HEAD(del_list);
645
646 cancel_delayed_work(&ent->dwork);
647 while (1) {
648 spin_lock_irq(&ent->lock);
649 if (list_empty(&ent->head)) {
650 spin_unlock_irq(&ent->lock);
651 break;
652 }
653 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
654 list_move(&mr->list, &del_list);
655 ent->available_mrs--;
656 ent->total_mrs--;
657 spin_unlock_irq(&ent->lock);
658 mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
659 }
660
661 list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
662 list_del(&mr->list);
663 kfree(mr);
664 }
665 }
666
667 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
668 {
669 if (!mlx5_debugfs_root || dev->is_rep)
670 return;
671
672 debugfs_remove_recursive(dev->cache.root);
673 dev->cache.root = NULL;
674 }
675
676 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
677 {
678 struct mlx5_mr_cache *cache = &dev->cache;
679 struct mlx5_cache_ent *ent;
680 struct dentry *dir;
681 int i;
682
683 if (!mlx5_debugfs_root || dev->is_rep)
684 return;
685
686 cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
687
688 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
689 ent = &cache->ent[i];
690 sprintf(ent->name, "%d", ent->order);
691 dir = debugfs_create_dir(ent->name, cache->root);
692 debugfs_create_file("size", 0600, dir, ent, &size_fops);
693 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
694 debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
695 debugfs_create_u32("miss", 0600, dir, &ent->miss);
696 }
697 }
698
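/*
 * Runs one second after an asynchronous mkey creation failure (see
 * create_mkey_callback()); clearing fill_delay lets the cache resume
 * queuing new creations.
 */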
699 static void delay_time_func(struct timer_list *t)
700 {
701 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
702
703 WRITE_ONCE(dev->fill_delay, 0);
704 }
705
706 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
707 {
708 struct mlx5_mr_cache *cache = &dev->cache;
709 struct mlx5_cache_ent *ent;
710 int i;
711
712 mutex_init(&dev->slow_path_mutex);
713 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
714 if (!cache->wq) {
715 mlx5_ib_warn(dev, "failed to create work queue\n");
716 return -ENOMEM;
717 }
718
719 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
720 timer_setup(&dev->delay_timer, delay_time_func, 0);
721 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
722 ent = &cache->ent[i];
723 INIT_LIST_HEAD(&ent->head);
724 spin_lock_init(&ent->lock);
725 ent->order = i + 2;
726 ent->dev = dev;
727 ent->limit = 0;
728
729 INIT_WORK(&ent->work, cache_work_func);
730 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
731
732 if (i > MR_CACHE_LAST_STD_ENTRY) {
733 mlx5_odp_init_mr_cache_entry(ent);
734 continue;
735 }
736
737 if (ent->order > mr_cache_max_order(dev))
738 continue;
739
740 ent->page = PAGE_SHIFT;
741 ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
742 MLX5_IB_UMR_OCTOWORD;
743 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
744 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
745 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
746 mlx5_ib_can_load_pas_with_umr(dev, 0))
747 ent->limit = dev->mdev->profile.mr_cache[i].limit;
748 else
749 ent->limit = 0;
750 spin_lock_irq(&ent->lock);
751 queue_adjust_cache_locked(ent);
752 spin_unlock_irq(&ent->lock);
753 }
754
755 mlx5_mr_cache_debugfs_init(dev);
756
757 return 0;
758 }
759
760 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
761 {
762 unsigned int i;
763
764 if (!dev->cache.wq)
765 return 0;
766
767 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
768 struct mlx5_cache_ent *ent = &dev->cache.ent[i];
769
770 spin_lock_irq(&ent->lock);
771 ent->disabled = true;
772 spin_unlock_irq(&ent->lock);
773 cancel_work_sync(&ent->work);
774 cancel_delayed_work_sync(&ent->dwork);
775 }
776
777 mlx5_mr_cache_debugfs_cleanup(dev);
778 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
779
780 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
781 clean_keys(dev, i);
782
783 destroy_workqueue(dev->cache.wq);
784 del_timer_sync(&dev->delay_timer);
785
786 return 0;
787 }
788
789 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
790 {
791 struct mlx5_ib_dev *dev = to_mdev(pd->device);
792 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
793 struct mlx5_ib_mr *mr;
794 void *mkc;
795 u32 *in;
796 int err;
797
798 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
799 if (!mr)
800 return ERR_PTR(-ENOMEM);
801
802 in = kzalloc(inlen, GFP_KERNEL);
803 if (!in) {
804 err = -ENOMEM;
805 goto err_free;
806 }
807
808 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
809
810 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
811 MLX5_SET(mkc, mkc, length64, 1);
812 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
813 pd);
814
815 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
816 if (err)
817 goto err_in;
818
819 kfree(in);
820 mr->mmkey.type = MLX5_MKEY_MR;
821 mr->ibmr.lkey = mr->mmkey.key;
822 mr->ibmr.rkey = mr->mmkey.key;
823 mr->umem = NULL;
824
825 return &mr->ibmr;
826
827 err_in:
828 kfree(in);
829
830 err_free:
831 kfree(mr);
832
833 return ERR_PTR(err);
834 }
835
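/*
 * Number of octowords (16 bytes) needed for the page list: each MTT entry is
 * 8 bytes, so one octoword holds two entries, hence the (npages + 1) / 2.
 */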
836 static int get_octo_len(u64 addr, u64 len, int page_shift)
837 {
838 u64 page_size = 1ULL << page_shift;
839 u64 offset;
840 int npages;
841
842 offset = addr & (page_size - 1);
843 npages = ALIGN(len + offset, page_size) >> page_shift;
844 return (npages + 1) / 2;
845 }
846
847 static int mr_cache_max_order(struct mlx5_ib_dev *dev)
848 {
849 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
850 return MR_CACHE_LAST_STD_ENTRY + 2;
851 return MLX5_MAX_UMR_SHIFT;
852 }
853
854 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
855 {
856 struct mlx5_ib_umr_context *context =
857 container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
858
859 context->status = wc->status;
860 complete(&context->done);
861 }
862
863 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
864 {
865 context->cqe.done = mlx5_ib_umr_done;
866 context->status = -1;
867 init_completion(&context->done);
868 }
869
870 static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
871 struct mlx5_umr_wr *umrwr)
872 {
873 struct umr_common *umrc = &dev->umrc;
874 const struct ib_send_wr *bad;
875 int err;
876 struct mlx5_ib_umr_context umr_context;
877
878 mlx5_ib_init_umr_context(&umr_context);
879 umrwr->wr.wr_cqe = &umr_context.cqe;
880
881 down(&umrc->sem);
882 err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
883 if (err) {
884 mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
885 } else {
886 wait_for_completion(&umr_context.done);
887 if (umr_context.status != IB_WC_SUCCESS) {
888 mlx5_ib_warn(dev, "reg umr failed (%u)\n",
889 umr_context.status);
890 err = -EFAULT;
891 }
892 }
893 up(&umrc->sem);
894 return err;
895 }
896
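/*
 * Map a page-list order to its cache bucket. Entry i of the cache holds
 * mkeys sized for 2^(i + 2) pages (ent->order = i + 2, set in
 * mlx5_mr_cache_init()).
 */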
897 static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
898 unsigned int order)
899 {
900 struct mlx5_mr_cache *cache = &dev->cache;
901
902 if (order < cache->ent[0].order)
903 return &cache->ent[0];
904 order = order - cache->ent[0].order;
905 if (order > MR_CACHE_LAST_STD_ENTRY)
906 return NULL;
907 return &cache->ent[order];
908 }
909
910 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
911 u64 length, int access_flags, u64 iova)
912 {
913 mr->ibmr.lkey = mr->mmkey.key;
914 mr->ibmr.rkey = mr->mmkey.key;
915 mr->ibmr.length = length;
916 mr->ibmr.device = &dev->ib_dev;
917 mr->ibmr.iova = iova;
918 mr->access_flags = access_flags;
919 }
920
921 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
922 u64 iova)
923 {
924 /*
925 * The alignment of iova has already been checked upon entering
926 * UVERBS_METHOD_REG_DMABUF_MR
927 */
928 umem->iova = iova;
929 return PAGE_SIZE;
930 }
931
932 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
933 struct ib_umem *umem, u64 iova,
934 int access_flags)
935 {
936 struct mlx5_ib_dev *dev = to_mdev(pd->device);
937 struct mlx5_cache_ent *ent;
938 struct mlx5_ib_mr *mr;
939 unsigned int page_size;
940
941 if (umem->is_dmabuf)
942 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
943 else
944 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
945 0, iova);
946 if (WARN_ON(!page_size))
947 return ERR_PTR(-EINVAL);
948 ent = mr_cache_ent_from_order(
949 dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
950 /*
951 * Matches access in alloc_cache_mr(). If the MR can't come from the
952 * cache then synchronously create an uncached one.
953 */
954 if (!ent || ent->limit == 0 ||
955 !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
956 mutex_lock(&dev->slow_path_mutex);
957 mr = reg_create(pd, umem, iova, access_flags, page_size, false);
958 mutex_unlock(&dev->slow_path_mutex);
959 return mr;
960 }
961
962 mr = get_cache_mr(ent);
963 if (!mr) {
964 mr = create_cache_mr(ent);
965 /*
966 * The above already tried to do the same stuff as reg_create(),
967 * no reason to try it again.
968 */
969 if (IS_ERR(mr))
970 return mr;
971 }
972
973 mr->ibmr.pd = pd;
974 mr->umem = umem;
975 mr->page_shift = order_base_2(page_size);
976 set_mr_fields(dev, mr, umem->length, access_flags, iova);
977
978 return mr;
979 }
980
981 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
982 MLX5_UMR_MTT_ALIGNMENT)
983 #define MLX5_SPARE_UMR_CHUNK 0x10000
984
985 /*
986 * Allocate a temporary buffer to hold the per-page information to transfer to
987 * HW. For efficiency this should be as large as it can be, but buffer
988 * allocation failure is not allowed, so try smaller sizes.
989 */
990 static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
991 {
992 const size_t xlt_chunk_align =
993 MLX5_UMR_MTT_ALIGNMENT / ent_size;
994 size_t size;
995 void *res = NULL;
996
997 static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
998
999 /*
1000 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that the
1001 * allocation can't trigger any kind of reclaim.
1002 */
1003 might_sleep();
1004
1005 gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
1006
1007 /*
1008 * If the system already has a suitable high order page then just use
1009 * that, but don't try hard to create one. This max is about 1M, so a
1010 * free x86 huge page will satisfy it.
1011 */
1012 size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1013 MLX5_MAX_UMR_CHUNK);
1014 *nents = size / ent_size;
1015 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1016 get_order(size));
1017 if (res)
1018 return res;
1019
1020 if (size > MLX5_SPARE_UMR_CHUNK) {
1021 size = MLX5_SPARE_UMR_CHUNK;
1022 *nents = size / ent_size;
1023 res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1024 get_order(size));
1025 if (res)
1026 return res;
1027 }
1028
1029 *nents = PAGE_SIZE / ent_size;
1030 res = (void *)__get_free_page(gfp_mask);
1031 if (res)
1032 return res;
1033
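/*
 * Last resort: fall back to the single pre-allocated emergency page,
 * serialized by xlt_emergency_page_mutex and released in mlx5_ib_free_xlt().
 */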
1034 mutex_lock(&xlt_emergency_page_mutex);
1035 memset(xlt_emergency_page, 0, PAGE_SIZE);
1036 return xlt_emergency_page;
1037 }
1038
1039 static void mlx5_ib_free_xlt(void *xlt, size_t length)
1040 {
1041 if (xlt == xlt_emergency_page) {
1042 mutex_unlock(&xlt_emergency_page_mutex);
1043 return;
1044 }
1045
1046 free_pages((unsigned long)xlt, get_order(length));
1047 }
1048
1049 /*
1050 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1051 * submission.
1052 */
1053 static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1054 struct mlx5_umr_wr *wr, struct ib_sge *sg,
1055 size_t nents, size_t ent_size,
1056 unsigned int flags)
1057 {
1058 struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1059 struct device *ddev = &dev->mdev->pdev->dev;
1060 dma_addr_t dma;
1061 void *xlt;
1062
1063 xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1064 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1065 GFP_KERNEL);
1066 sg->length = nents * ent_size;
1067 dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1068 if (dma_mapping_error(ddev, dma)) {
1069 mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1070 mlx5_ib_free_xlt(xlt, sg->length);
1071 return NULL;
1072 }
1073 sg->addr = dma;
1074 sg->lkey = dev->umrc.pd->local_dma_lkey;
1075
1076 memset(wr, 0, sizeof(*wr));
1077 wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1078 if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1079 wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1080 wr->wr.sg_list = sg;
1081 wr->wr.num_sge = 1;
1082 wr->wr.opcode = MLX5_IB_WR_UMR;
1083 wr->pd = mr->ibmr.pd;
1084 wr->mkey = mr->mmkey.key;
1085 wr->length = mr->ibmr.length;
1086 wr->virt_addr = mr->ibmr.iova;
1087 wr->access_flags = mr->access_flags;
1088 wr->page_shift = mr->page_shift;
1089 wr->xlt_size = sg->length;
1090 return xlt;
1091 }
1092
1093 static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1094 struct ib_sge *sg)
1095 {
1096 struct device *ddev = &dev->mdev->pdev->dev;
1097
1098 dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1099 mlx5_ib_free_xlt(xlt, sg->length);
1100 }
1101
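/*
 * Send flags that are only applied to the final UMR work request of a
 * multi-post XLT update (see mlx5_ib_update_xlt() and mlx5_ib_update_mr_pas()).
 */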
1102 static unsigned int xlt_wr_final_send_flags(unsigned int flags)
1103 {
1104 unsigned int res = 0;
1105
1106 if (flags & MLX5_IB_UPD_XLT_ENABLE)
1107 res |= MLX5_IB_SEND_UMR_ENABLE_MR |
1108 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1109 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1110 if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
1111 res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1112 if (flags & MLX5_IB_UPD_XLT_ADDR)
1113 res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1114 return res;
1115 }
1116
1117 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1118 int page_shift, int flags)
1119 {
1120 struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1121 struct device *ddev = &dev->mdev->pdev->dev;
1122 void *xlt;
1123 struct mlx5_umr_wr wr;
1124 struct ib_sge sg;
1125 int err = 0;
1126 int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1127 ? sizeof(struct mlx5_klm)
1128 : sizeof(struct mlx5_mtt);
1129 const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1130 const int page_mask = page_align - 1;
1131 size_t pages_mapped = 0;
1132 size_t pages_to_map = 0;
1133 size_t pages_iter;
1134 size_t size_to_map = 0;
1135 size_t orig_sg_length;
1136
1137 if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1138 !umr_can_use_indirect_mkey(dev))
1139 return -EPERM;
1140
1141 if (WARN_ON(!mr->umem->is_odp))
1142 return -EINVAL;
1143
1144 /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1145 * so we need to align the offset and length accordingly
1146 */
1147 if (idx & page_mask) {
1148 npages += idx & page_mask;
1149 idx &= ~page_mask;
1150 }
1151 pages_to_map = ALIGN(npages, page_align);
1152
1153 xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1154 if (!xlt)
1155 return -ENOMEM;
1156 pages_iter = sg.length / desc_size;
1157 orig_sg_length = sg.length;
1158
1159 if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1160 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1161 size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1162
1163 pages_to_map = min_t(size_t, pages_to_map, max_pages);
1164 }
1165
1166 wr.page_shift = page_shift;
1167
1168 for (pages_mapped = 0;
1169 pages_mapped < pages_to_map && !err;
1170 pages_mapped += pages_iter, idx += pages_iter) {
1171 npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1172 size_to_map = npages * desc_size;
1173 dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1174 DMA_TO_DEVICE);
1175 mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1176 dma_sync_single_for_device(ddev, sg.addr, sg.length,
1177 DMA_TO_DEVICE);
1178
1179 sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1180
1181 if (pages_mapped + pages_iter >= pages_to_map)
1182 wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1183
1184 wr.offset = idx * desc_size;
1185 wr.xlt_size = sg.length;
1186
1187 err = mlx5_ib_post_send_wait(dev, &wr);
1188 }
1189 sg.length = orig_sg_length;
1190 mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1191 return err;
1192 }
1193
1194 /*
1195 * Send the DMA list to the HW for a normal MR using UMR.
1196 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
1197 * flag may be used.
1198 */
1199 int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
1200 {
1201 struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1202 struct device *ddev = &dev->mdev->pdev->dev;
1203 struct ib_block_iter biter;
1204 struct mlx5_mtt *cur_mtt;
1205 struct mlx5_umr_wr wr;
1206 size_t orig_sg_length;
1207 struct mlx5_mtt *mtt;
1208 size_t final_size;
1209 struct ib_sge sg;
1210 int err = 0;
1211
1212 if (WARN_ON(mr->umem->is_odp))
1213 return -EINVAL;
1214
1215 mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
1216 ib_umem_num_dma_blocks(mr->umem,
1217 1 << mr->page_shift),
1218 sizeof(*mtt), flags);
1219 if (!mtt)
1220 return -ENOMEM;
1221 orig_sg_length = sg.length;
1222
1223 cur_mtt = mtt;
1224 rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter,
1225 mr->umem->sgt_append.sgt.nents,
1226 BIT(mr->page_shift)) {
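/* The staging buffer is full; post the accumulated MTTs and reuse it */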
1227 if (cur_mtt == (void *)mtt + sg.length) {
1228 dma_sync_single_for_device(ddev, sg.addr, sg.length,
1229 DMA_TO_DEVICE);
1230 err = mlx5_ib_post_send_wait(dev, &wr);
1231 if (err)
1232 goto err;
1233 dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1234 DMA_TO_DEVICE);
1235 wr.offset += sg.length;
1236 cur_mtt = mtt;
1237 }
1238
1239 cur_mtt->ptag =
1240 cpu_to_be64(rdma_block_iter_dma_address(&biter) |
1241 MLX5_IB_MTT_PRESENT);
1242
1243 if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
1244 cur_mtt->ptag = 0;
1245
1246 cur_mtt++;
1247 }
1248
1249 final_size = (void *)cur_mtt - (void *)mtt;
1250 sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
1251 memset(cur_mtt, 0, sg.length - final_size);
1252 wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1253 wr.xlt_size = sg.length;
1254
1255 dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
1256 err = mlx5_ib_post_send_wait(dev, &wr);
1257
1258 err:
1259 sg.length = orig_sg_length;
1260 mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
1261 return err;
1262 }
1263
1264 /*
1265 * Create an MR directly with a create_mkey command, bypassing the MR cache.
1266 * If populate is false, the mkey is created free and filled in later via UMR.
1267 */
1268 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1269 u64 iova, int access_flags,
1270 unsigned int page_size, bool populate)
1271 {
1272 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1273 struct mlx5_ib_mr *mr;
1274 __be64 *pas;
1275 void *mkc;
1276 int inlen;
1277 u32 *in;
1278 int err;
1279 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1280
1281 if (!page_size)
1282 return ERR_PTR(-EINVAL);
1283 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1284 if (!mr)
1285 return ERR_PTR(-ENOMEM);
1286
1287 mr->ibmr.pd = pd;
1288 mr->access_flags = access_flags;
1289 mr->page_shift = order_base_2(page_size);
1290
1291 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1292 if (populate)
1293 inlen += sizeof(*pas) *
1294 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1295 in = kvzalloc(inlen, GFP_KERNEL);
1296 if (!in) {
1297 err = -ENOMEM;
1298 goto err_1;
1299 }
1300 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1301 if (populate) {
1302 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1303 err = -EINVAL;
1304 goto err_2;
1305 }
1306 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1307 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1308 }
1309
1310 /* The pg_access bit allows setting the access flags
1311 * in the page list submitted with the command. */
1312 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1313
1314 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1315 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1316 populate ? pd : dev->umrc.pd);
1317 MLX5_SET(mkc, mkc, free, !populate);
1318 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1319 MLX5_SET(mkc, mkc, umr_en, 1);
1320
1321 MLX5_SET64(mkc, mkc, len, umem->length);
1322 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1323 MLX5_SET(mkc, mkc, translations_octword_size,
1324 get_octo_len(iova, umem->length, mr->page_shift));
1325 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1326 if (populate) {
1327 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1328 get_octo_len(iova, umem->length, mr->page_shift));
1329 }
1330
1331 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1332 if (err) {
1333 mlx5_ib_warn(dev, "create mkey failed\n");
1334 goto err_2;
1335 }
1336 mr->mmkey.type = MLX5_MKEY_MR;
1337 mr->umem = umem;
1338 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1339 kvfree(in);
1340
1341 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1342
1343 return mr;
1344
1345 err_2:
1346 kvfree(in);
1347 err_1:
1348 kfree(mr);
1349 return ERR_PTR(err);
1350 }
1351
1352 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1353 u64 length, int acc, int mode)
1354 {
1355 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1356 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1357 struct mlx5_ib_mr *mr;
1358 void *mkc;
1359 u32 *in;
1360 int err;
1361
1362 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1363 if (!mr)
1364 return ERR_PTR(-ENOMEM);
1365
1366 in = kzalloc(inlen, GFP_KERNEL);
1367 if (!in) {
1368 err = -ENOMEM;
1369 goto err_free;
1370 }
1371
1372 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1373
1374 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1375 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1376 MLX5_SET64(mkc, mkc, len, length);
1377 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1378
1379 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1380 if (err)
1381 goto err_in;
1382
1383 kfree(in);
1384
1385 set_mr_fields(dev, mr, length, acc, start_addr);
1386
1387 return &mr->ibmr;
1388
1389 err_in:
1390 kfree(in);
1391
1392 err_free:
1393 kfree(mr);
1394
1395 return ERR_PTR(err);
1396 }
1397
1398 int mlx5_ib_advise_mr(struct ib_pd *pd,
1399 enum ib_uverbs_advise_mr_advice advice,
1400 u32 flags,
1401 struct ib_sge *sg_list,
1402 u32 num_sge,
1403 struct uverbs_attr_bundle *attrs)
1404 {
1405 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1406 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1407 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1408 return -EOPNOTSUPP;
1409
1410 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1411 sg_list, num_sge);
1412 }
1413
1414 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1415 struct ib_dm_mr_attr *attr,
1416 struct uverbs_attr_bundle *attrs)
1417 {
1418 struct mlx5_ib_dm *mdm = to_mdm(dm);
1419 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1420 u64 start_addr = mdm->dev_addr + attr->offset;
1421 int mode;
1422
1423 switch (mdm->type) {
1424 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1425 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1426 return ERR_PTR(-EINVAL);
1427
1428 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1429 start_addr -= pci_resource_start(dev->pdev, 0);
1430 break;
1431 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1432 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1433 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1434 return ERR_PTR(-EINVAL);
1435
1436 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1437 break;
1438 default:
1439 return ERR_PTR(-EINVAL);
1440 }
1441
1442 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1443 attr->access_flags, mode);
1444 }
1445
1446 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1447 u64 iova, int access_flags)
1448 {
1449 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1450 struct mlx5_ib_mr *mr = NULL;
1451 bool xlt_with_umr;
1452 int err;
1453
1454 xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
1455 if (xlt_with_umr) {
1456 mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1457 } else {
1458 unsigned int page_size = mlx5_umem_find_best_pgsz(
1459 umem, mkc, log_page_size, 0, iova);
1460
1461 mutex_lock(&dev->slow_path_mutex);
1462 mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1463 mutex_unlock(&dev->slow_path_mutex);
1464 }
1465 if (IS_ERR(mr)) {
1466 ib_umem_release(umem);
1467 return ERR_CAST(mr);
1468 }
1469
1470 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1471
1472 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1473
1474 if (xlt_with_umr) {
1475 /*
1476 * If the MR was created with reg_create then it will be
1477 * configured properly but left disabled. It is safe to go ahead
1478 * and configure it again via UMR while enabling it.
1479 */
1480 err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1481 if (err) {
1482 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1483 return ERR_PTR(err);
1484 }
1485 }
1486 return &mr->ibmr;
1487 }
1488
1489 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1490 u64 iova, int access_flags,
1491 struct ib_udata *udata)
1492 {
1493 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1494 struct ib_umem_odp *odp;
1495 struct mlx5_ib_mr *mr;
1496 int err;
1497
1498 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1499 return ERR_PTR(-EOPNOTSUPP);
1500
1501 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1502 if (err)
1503 return ERR_PTR(err);
1504 if (!start && length == U64_MAX) {
1505 if (iova != 0)
1506 return ERR_PTR(-EINVAL);
1507 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1508 return ERR_PTR(-EINVAL);
1509
1510 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1511 if (IS_ERR(mr))
1512 return ERR_CAST(mr);
1513 return &mr->ibmr;
1514 }
1515
1516 /* ODP requires xlt update via umr to work. */
1517 if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1518 return ERR_PTR(-EINVAL);
1519
1520 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1521 &mlx5_mn_ops);
1522 if (IS_ERR(odp))
1523 return ERR_CAST(odp);
1524
1525 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1526 if (IS_ERR(mr)) {
1527 ib_umem_release(&odp->umem);
1528 return ERR_CAST(mr);
1529 }
1530 xa_init(&mr->implicit_children);
1531
1532 odp->private = mr;
1533 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1534 if (err)
1535 goto err_dereg_mr;
1536
1537 err = mlx5_ib_init_odp_mr(mr);
1538 if (err)
1539 goto err_dereg_mr;
1540 return &mr->ibmr;
1541
1542 err_dereg_mr:
1543 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1544 return ERR_PTR(err);
1545 }
1546
1547 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1548 u64 iova, int access_flags,
1549 struct ib_udata *udata)
1550 {
1551 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1552 struct ib_umem *umem;
1553
1554 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1555 return ERR_PTR(-EOPNOTSUPP);
1556
1557 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1558 start, iova, length, access_flags);
1559
1560 if (access_flags & IB_ACCESS_ON_DEMAND)
1561 return create_user_odp_mr(pd, start, length, iova, access_flags,
1562 udata);
1563 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1564 if (IS_ERR(umem))
1565 return ERR_CAST(umem);
1566 return create_real_mr(pd, umem, iova, access_flags);
1567 }
1568
1569 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1570 {
1571 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1572 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1573
1574 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1575
1576 if (!umem_dmabuf->sgt)
1577 return;
1578
1579 mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1580 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1581 }
1582
1583 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1584 .allow_peer2peer = 1,
1585 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1586 };
1587
1588 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1589 u64 length, u64 virt_addr,
1590 int fd, int access_flags,
1591 struct ib_udata *udata)
1592 {
1593 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1594 struct mlx5_ib_mr *mr = NULL;
1595 struct ib_umem_dmabuf *umem_dmabuf;
1596 int err;
1597
1598 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1599 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1600 return ERR_PTR(-EOPNOTSUPP);
1601
1602 mlx5_ib_dbg(dev,
1603 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1604 offset, virt_addr, length, fd, access_flags);
1605
1606 /* dmabuf requires xlt update via umr to work. */
1607 if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1608 return ERR_PTR(-EINVAL);
1609
1610 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1611 access_flags,
1612 &mlx5_ib_dmabuf_attach_ops);
1613 if (IS_ERR(umem_dmabuf)) {
1614 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1615 PTR_ERR(umem_dmabuf));
1616 return ERR_CAST(umem_dmabuf);
1617 }
1618
1619 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1620 access_flags);
1621 if (IS_ERR(mr)) {
1622 ib_umem_release(&umem_dmabuf->umem);
1623 return ERR_CAST(mr);
1624 }
1625
1626 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1627
1628 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1629 umem_dmabuf->private = mr;
1630 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1631 if (err)
1632 goto err_dereg_mr;
1633
1634 err = mlx5_ib_init_dmabuf_mr(mr);
1635 if (err)
1636 goto err_dereg_mr;
1637 return &mr->ibmr;
1638
1639 err_dereg_mr:
1640 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1641 return ERR_PTR(err);
1642 }
1643
1644 /**
1645 * revoke_mr - Fence all DMA on the MR
1646 * @mr: The MR to fence
1647 *
1648 * Upon return the NIC will not be doing any DMA to the pages under the MR,
1649 * and any DMA in progress will be completed. Failure of this function
1650 * indicates the HW has failed catastrophically.
1651 */
1652 static int revoke_mr(struct mlx5_ib_mr *mr)
1653 {
1654 struct mlx5_umr_wr umrwr = {};
1655
1656 if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1657 return 0;
1658
1659 umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1660 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1661 umrwr.wr.opcode = MLX5_IB_WR_UMR;
1662 umrwr.pd = mr_to_mdev(mr)->umrc.pd;
1663 umrwr.mkey = mr->mmkey.key;
1664 umrwr.ignore_free_state = 1;
1665
1666 return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
1667 }
1668
1669 /*
1670 * True if the change in access flags can be done via UMR, only some access
1671 * flags can be updated.
1672 */
1673 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1674 unsigned int current_access_flags,
1675 unsigned int target_access_flags)
1676 {
1677 unsigned int diffs = current_access_flags ^ target_access_flags;
1678
1679 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1680 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1681 return false;
1682 return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
1683 target_access_flags);
1684 }
1685
1686 static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1687 int access_flags)
1688 {
1689 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1690 struct mlx5_umr_wr umrwr = {
1691 .wr = {
1692 .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1693 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
1694 .opcode = MLX5_IB_WR_UMR,
1695 },
1696 .mkey = mr->mmkey.key,
1697 .pd = pd,
1698 .access_flags = access_flags,
1699 };
1700 int err;
1701
1702 err = mlx5_ib_post_send_wait(dev, &umrwr);
1703 if (err)
1704 return err;
1705
1706 mr->access_flags = access_flags;
1707 return 0;
1708 }
1709
1710 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1711 struct ib_umem *new_umem,
1712 int new_access_flags, u64 iova,
1713 unsigned long *page_size)
1714 {
1715 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1716
1717 /* We only track the allocated sizes of MRs from the cache */
1718 if (!mr->cache_ent)
1719 return false;
1720 if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
1721 return false;
1722
1723 *page_size =
1724 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1725 if (WARN_ON(!*page_size))
1726 return false;
1727 return (1ULL << mr->cache_ent->order) >=
1728 ib_umem_num_dma_blocks(new_umem, *page_size);
1729 }
1730
1731 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1732 int access_flags, int flags, struct ib_umem *new_umem,
1733 u64 iova, unsigned long page_size)
1734 {
1735 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1736 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1737 struct ib_umem *old_umem = mr->umem;
1738 int err;
1739
1740 /*
1741 * To keep everything simple the MR is revoked before we start to mess
1742 * with it. This ensures the change is atomic relative to any use of the
1743 * MR.
1744 */
1745 err = revoke_mr(mr);
1746 if (err)
1747 return err;
1748
1749 if (flags & IB_MR_REREG_PD) {
1750 mr->ibmr.pd = pd;
1751 upd_flags |= MLX5_IB_UPD_XLT_PD;
1752 }
1753 if (flags & IB_MR_REREG_ACCESS) {
1754 mr->access_flags = access_flags;
1755 upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1756 }
1757
1758 mr->ibmr.length = new_umem->length;
1759 mr->ibmr.iova = iova;
1761 mr->page_shift = order_base_2(page_size);
1762 mr->umem = new_umem;
1763 err = mlx5_ib_update_mr_pas(mr, upd_flags);
1764 if (err) {
1765 /*
1766 * The MR is revoked at this point, so it is safe to free
1767 * new_umem.
1768 */
1769 mr->umem = old_umem;
1770 return err;
1771 }
1772
1773 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1774 ib_umem_release(old_umem);
1775 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1776 return 0;
1777 }
1778
1779 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1780 u64 length, u64 iova, int new_access_flags,
1781 struct ib_pd *new_pd,
1782 struct ib_udata *udata)
1783 {
1784 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1785 struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1786 int err;
1787
1788 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1789 return ERR_PTR(-EOPNOTSUPP);
1790
1791 mlx5_ib_dbg(
1792 dev,
1793 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1794 start, iova, length, new_access_flags);
1795
1796 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1797 return ERR_PTR(-EOPNOTSUPP);
1798
1799 if (!(flags & IB_MR_REREG_ACCESS))
1800 new_access_flags = mr->access_flags;
1801 if (!(flags & IB_MR_REREG_PD))
1802 new_pd = ib_mr->pd;
1803
1804 if (!(flags & IB_MR_REREG_TRANS)) {
1805 struct ib_umem *umem;
1806
1807 /* Fast path for PD/access change */
1808 if (can_use_umr_rereg_access(dev, mr->access_flags,
1809 new_access_flags)) {
1810 err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
1811 if (err)
1812 return ERR_PTR(err);
1813 return NULL;
1814 }
1815 /* DM or ODP MRs don't have a normal umem so we can't reuse it */
1816 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1817 goto recreate;
1818
1819 /*
1820 * Only one active MR can refer to a umem at a time; revoke
1821 * the old MR before assigning the umem to the new one.
1822 */
1823 err = revoke_mr(mr);
1824 if (err)
1825 return ERR_PTR(err);
1826 umem = mr->umem;
1827 mr->umem = NULL;
1828 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1829
1830 return create_real_mr(new_pd, umem, mr->ibmr.iova,
1831 new_access_flags);
1832 }
1833
1834 /*
1835 * DM doesn't have a PAS list so we can't reuse it; ODP/dmabuf do,
1836 * but the logic around releasing the umem is different.
1837 */
1838 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1839 goto recreate;
1840
1841 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1842 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1843 struct ib_umem *new_umem;
1844 unsigned long page_size;
1845
1846 new_umem = ib_umem_get(&dev->ib_dev, start, length,
1847 new_access_flags);
1848 if (IS_ERR(new_umem))
1849 return ERR_CAST(new_umem);
1850
1851 /* Fast path for PAS change */
1852 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1853 &page_size)) {
1854 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1855 new_umem, iova, page_size);
1856 if (err) {
1857 ib_umem_release(new_umem);
1858 return ERR_PTR(err);
1859 }
1860 return NULL;
1861 }
1862 return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1863 }
1864
1865 /*
1866 * Everything else has no state we can preserve; just create a new MR
1867 * from scratch.
1868 */
1869 recreate:
1870 return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1871 new_access_flags, udata);
1872 }
1873
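/*
 * Allocate the kernel-owned descriptor buffer used by memory registration MRs
 * (MTT/KLM lists), aligned to MLX5_UMR_ALIGN, and DMA map it towards the
 * device.
 */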
1874 static int
1875 mlx5_alloc_priv_descs(struct ib_device *device,
1876 struct mlx5_ib_mr *mr,
1877 int ndescs,
1878 int desc_size)
1879 {
1880 struct mlx5_ib_dev *dev = to_mdev(device);
1881 struct device *ddev = &dev->mdev->pdev->dev;
1882 int size = ndescs * desc_size;
1883 int add_size;
1884 int ret;
1885
1886 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1887
1888 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1889 if (!mr->descs_alloc)
1890 return -ENOMEM;
1891
1892 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1893
1894 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1895 if (dma_mapping_error(ddev, mr->desc_map)) {
1896 ret = -ENOMEM;
1897 goto err;
1898 }
1899
1900 return 0;
1901 err:
1902 kfree(mr->descs_alloc);
1903
1904 return ret;
1905 }
1906
1907 static void
1908 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1909 {
1910 if (!mr->umem && mr->descs) {
1911 struct ib_device *device = mr->ibmr.device;
1912 int size = mr->max_descs * mr->desc_size;
1913 struct mlx5_ib_dev *dev = to_mdev(device);
1914
1915 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1916 DMA_TO_DEVICE);
1917 kfree(mr->descs_alloc);
1918 mr->descs = NULL;
1919 }
1920 }
1921
1922 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1923 {
1924 struct mlx5_ib_mr *mr = to_mmr(ibmr);
1925 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1926 int rc;
1927
1928 /*
1929 * Any async use of the MR must hold the refcount. Once the refcount
1930 * goes to zero, no other thread, such as ODP page faults, prefetch or
1931 * any UMR activity, can touch the mkey, so it is safe to destroy it.
1932 */
1933 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1934 refcount_read(&mr->mmkey.usecount) != 0 &&
1935 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1936 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1937
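	/*
	 * Integrity MRs carry two internal pi MRs (MTT and KLM based) and a
	 * pair of PSVs; tear those down before destroying the main mkey.
	 */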
1938 if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1939 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1940 mr->sig, NULL, GFP_KERNEL);
1941
1942 if (mr->mtt_mr) {
1943 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1944 if (rc)
1945 return rc;
1946 mr->mtt_mr = NULL;
1947 }
1948 if (mr->klm_mr) {
1949 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1950 if (rc)
1951 return rc;
1952 mr->klm_mr = NULL;
1953 }
1954
1955 if (mlx5_core_destroy_psv(dev->mdev,
1956 mr->sig->psv_memory.psv_idx))
1957 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1958 mr->sig->psv_memory.psv_idx);
1959 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1960 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1961 mr->sig->psv_wire.psv_idx);
1962 kfree(mr->sig);
1963 mr->sig = NULL;
1964 }
1965
1966 /* Stop DMA */
1967 if (mr->cache_ent) {
1968 if (revoke_mr(mr)) {
1969 spin_lock_irq(&mr->cache_ent->lock);
1970 mr->cache_ent->total_mrs--;
1971 spin_unlock_irq(&mr->cache_ent->lock);
1972 mr->cache_ent = NULL;
1973 }
1974 }
1975 if (!mr->cache_ent) {
1976 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1977 if (rc)
1978 return rc;
1979 }
1980
1981 if (mr->umem) {
1982 bool is_odp = is_odp_mr(mr);
1983
1984 if (!is_odp)
1985 atomic_sub(ib_umem_num_pages(mr->umem),
1986 &dev->mdev->priv.reg_pages);
1987 ib_umem_release(mr->umem);
1988 if (is_odp)
1989 mlx5_ib_free_odp_mr(mr);
1990 }
1991
1992 if (mr->cache_ent) {
1993 mlx5_mr_cache_free(dev, mr);
1994 } else {
1995 mlx5_free_priv_descs(mr);
1996 kfree(mr);
1997 }
1998 return 0;
1999 }
2000
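/*
 * Initialize the mkey context for a kernel MR that starts out in the free
 * state and is later programmed through UMR (umr_en is set).
 */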
2001 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2002 int access_mode, int page_shift)
2003 {
2004 void *mkc;
2005
2006 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2007
2008 /* This is only used from the kernel, so setting the PD is OK. */
2009 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2010 MLX5_SET(mkc, mkc, free, 1);
2011 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2012 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2013 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2014 MLX5_SET(mkc, mkc, umr_en, 1);
2015 MLX5_SET(mkc, mkc, log_page_size, page_shift);
2016 }
2017
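/*
 * Common helper for the kernel MR types: allocate the private descriptor
 * buffer and create the UMR-enabled mkey described by @in.
 */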
2018 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2019 int ndescs, int desc_size, int page_shift,
2020 int access_mode, u32 *in, int inlen)
2021 {
2022 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2023 int err;
2024
2025 mr->access_mode = access_mode;
2026 mr->desc_size = desc_size;
2027 mr->max_descs = ndescs;
2028
2029 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2030 if (err)
2031 return err;
2032
2033 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2034
2035 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2036 if (err)
2037 goto err_free_descs;
2038
2039 mr->mmkey.type = MLX5_MKEY_MR;
2040 mr->ibmr.lkey = mr->mmkey.key;
2041 mr->ibmr.rkey = mr->mmkey.key;
2042
2043 return 0;
2044
2045 err_free_descs:
2046 mlx5_free_priv_descs(mr);
2047 return err;
2048 }
2049
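/*
 * Allocate an internal protection-information MR (MTT or KLM based) used by
 * an IB_MR_TYPE_INTEGRITY MR to map its data and metadata scatterlists.
 */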
2050 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2051 u32 max_num_sg, u32 max_num_meta_sg,
2052 int desc_size, int access_mode)
2053 {
2054 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2055 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2056 int page_shift = 0;
2057 struct mlx5_ib_mr *mr;
2058 u32 *in;
2059 int err;
2060
2061 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2062 if (!mr)
2063 return ERR_PTR(-ENOMEM);
2064
2065 mr->ibmr.pd = pd;
2066 mr->ibmr.device = pd->device;
2067
2068 in = kzalloc(inlen, GFP_KERNEL);
2069 if (!in) {
2070 err = -ENOMEM;
2071 goto err_free;
2072 }
2073
2074 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2075 page_shift = PAGE_SHIFT;
2076
2077 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2078 access_mode, in, inlen);
2079 if (err)
2080 goto err_free_in;
2081
2082 mr->umem = NULL;
2083 kfree(in);
2084
2085 return mr;
2086
2087 err_free_in:
2088 kfree(in);
2089 err_free:
2090 kfree(mr);
2091 return ERR_PTR(err);
2092 }
2093
2094 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2095 int ndescs, u32 *in, int inlen)
2096 {
2097 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2098 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2099 inlen);
2100 }
2101
2102 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2103 int ndescs, u32 *in, int inlen)
2104 {
2105 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2106 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2107 }
2108
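/*
 * Build an integrity MR: allocate the signature context and its memory/wire
 * PSVs, the internal KLM and MTT pi MRs, and finally the BSF-enabled KLM mkey
 * that ties them together.
 */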
2109 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2110 int max_num_sg, int max_num_meta_sg,
2111 u32 *in, int inlen)
2112 {
2113 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2114 u32 psv_index[2];
2115 void *mkc;
2116 int err;
2117
2118 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2119 if (!mr->sig)
2120 return -ENOMEM;
2121
2122 /* create mem & wire PSVs */
2123 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2124 if (err)
2125 goto err_free_sig;
2126
2127 mr->sig->psv_memory.psv_idx = psv_index[0];
2128 mr->sig->psv_wire.psv_idx = psv_index[1];
2129
2130 mr->sig->sig_status_checked = true;
2131 mr->sig->sig_err_exists = false;
2132 /* Next UMR, Arm SIGERR */
2133 ++mr->sig->sigerr_count;
2134 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2135 sizeof(struct mlx5_klm),
2136 MLX5_MKC_ACCESS_MODE_KLMS);
2137 if (IS_ERR(mr->klm_mr)) {
2138 err = PTR_ERR(mr->klm_mr);
2139 goto err_destroy_psv;
2140 }
2141 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2142 sizeof(struct mlx5_mtt),
2143 MLX5_MKC_ACCESS_MODE_MTT);
2144 if (IS_ERR(mr->mtt_mr)) {
2145 err = PTR_ERR(mr->mtt_mr);
2146 goto err_free_klm_mr;
2147 }
2148
2149 /* Set bsf descriptors for mkey */
2150 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2151 MLX5_SET(mkc, mkc, bsf_en, 1);
2152 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2153
2154 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2155 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2156 if (err)
2157 goto err_free_mtt_mr;
2158
2159 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2160 mr->sig, GFP_KERNEL));
2161 if (err)
2162 goto err_free_descs;
2163 return 0;
2164
2165 err_free_descs:
2166 destroy_mkey(dev, mr);
2167 mlx5_free_priv_descs(mr);
2168 err_free_mtt_mr:
2169 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2170 mr->mtt_mr = NULL;
2171 err_free_klm_mr:
2172 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2173 mr->klm_mr = NULL;
2174 err_destroy_psv:
2175 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2176 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2177 mr->sig->psv_memory.psv_idx);
2178 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2179 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2180 mr->sig->psv_wire.psv_idx);
2181 err_free_sig:
2182 kfree(mr->sig);
2183
2184 return err;
2185 }
2186
2187 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2188 enum ib_mr_type mr_type, u32 max_num_sg,
2189 u32 max_num_meta_sg)
2190 {
2191 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2192 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2193 int ndescs = ALIGN(max_num_sg, 4);
2194 struct mlx5_ib_mr *mr;
2195 u32 *in;
2196 int err;
2197
2198 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2199 if (!mr)
2200 return ERR_PTR(-ENOMEM);
2201
2202 in = kzalloc(inlen, GFP_KERNEL);
2203 if (!in) {
2204 err = -ENOMEM;
2205 goto err_free;
2206 }
2207
2208 mr->ibmr.device = pd->device;
2209 mr->umem = NULL;
2210
2211 switch (mr_type) {
2212 case IB_MR_TYPE_MEM_REG:
2213 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2214 break;
2215 case IB_MR_TYPE_SG_GAPS:
2216 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2217 break;
2218 case IB_MR_TYPE_INTEGRITY:
2219 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2220 max_num_meta_sg, in, inlen);
2221 break;
2222 default:
2223 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2224 err = -EINVAL;
2225 }
2226
2227 if (err)
2228 goto err_free_in;
2229
2230 kfree(in);
2231
2232 return &mr->ibmr;
2233
2234 err_free_in:
2235 kfree(in);
2236 err_free:
2237 kfree(mr);
2238 return ERR_PTR(err);
2239 }
2240
2241 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2242 u32 max_num_sg)
2243 {
2244 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2245 }
2246
2247 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2248 u32 max_num_sg, u32 max_num_meta_sg)
2249 {
2250 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2251 max_num_meta_sg);
2252 }
2253
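/*
 * Allocate a memory window. The MW is created as a free, UMR-enabled KLM mkey
 * that the user later binds to an MR.
 */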
2254 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2255 {
2256 struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2257 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2258 struct mlx5_ib_mw *mw = to_mmw(ibmw);
2259 unsigned int ndescs;
2260 u32 *in = NULL;
2261 void *mkc;
2262 int err;
2263 struct mlx5_ib_alloc_mw req = {};
2264 struct {
2265 __u32 comp_mask;
2266 __u32 response_length;
2267 } resp = {};
2268
2269 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2270 if (err)
2271 return err;
2272
2273 if (req.comp_mask || req.reserved1 || req.reserved2)
2274 return -EOPNOTSUPP;
2275
2276 if (udata->inlen > sizeof(req) &&
2277 !ib_is_udata_cleared(udata, sizeof(req),
2278 udata->inlen - sizeof(req)))
2279 return -EOPNOTSUPP;
2280
2281 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2282
2283 in = kzalloc(inlen, GFP_KERNEL);
2284 if (!in) {
2285 err = -ENOMEM;
2286 goto free;
2287 }
2288
2289 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2290
2291 MLX5_SET(mkc, mkc, free, 1);
2292 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2293 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2294 MLX5_SET(mkc, mkc, umr_en, 1);
2295 MLX5_SET(mkc, mkc, lr, 1);
2296 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2297 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2298 MLX5_SET(mkc, mkc, qpn, 0xffffff);
2299
2300 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2301 if (err)
2302 goto free;
2303
2304 mw->mmkey.type = MLX5_MKEY_MW;
2305 ibmw->rkey = mw->mmkey.key;
2306 mw->mmkey.ndescs = ndescs;
2307
2308 resp.response_length =
2309 min(offsetofend(typeof(resp), response_length), udata->outlen);
2310 if (resp.response_length) {
2311 err = ib_copy_to_udata(udata, &resp, resp.response_length);
2312 if (err)
2313 goto free_mkey;
2314 }
2315
2316 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2317 err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2318 if (err)
2319 goto free_mkey;
2320 }
2321
2322 kfree(in);
2323 return 0;
2324
2325 free_mkey:
2326 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2327 free:
2328 kfree(in);
2329 return err;
2330 }
2331
2332 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2333 {
2334 struct mlx5_ib_dev *dev = to_mdev(mw->device);
2335 struct mlx5_ib_mw *mmw = to_mmw(mw);
2336
2337 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2338 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2339 /*
2340 * pagefault_single_data_segment() may be accessing mmw
2341 * if the user bound an ODP MR to this MW.
2342 */
2343 mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2344
2345 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2346 }
2347
2348 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2349 struct ib_mr_status *mr_status)
2350 {
2351 struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2352 int ret = 0;
2353
2354 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2355 pr_err("Invalid status check mask\n");
2356 ret = -EINVAL;
2357 goto done;
2358 }
2359
2360 mr_status->fail_status = 0;
2361 if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2362 if (!mmr->sig) {
2363 ret = -EINVAL;
2364 pr_err("signature status check requested on a non-signature enabled MR\n");
2365 goto done;
2366 }
2367
2368 mmr->sig->sig_status_checked = true;
2369 if (!mmr->sig->sig_err_exists)
2370 goto done;
2371
2372 if (ibmr->lkey == mmr->sig->err_item.key)
2373 memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2374 sizeof(mr_status->sig_err));
2375 else {
2376 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2377 mr_status->sig_err.sig_err_offset = 0;
2378 mr_status->sig_err.key = mmr->sig->err_item.key;
2379 }
2380
2381 mmr->sig->sig_err_exists = false;
2382 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2383 }
2384
2385 done:
2386 return ret;
2387 }
2388
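/*
 * Fast path for integrity MRs: when the data and metadata each fit in a
 * single SG entry they are recorded as plain physical addresses, so no pi MR
 * translation (and no UMR) is needed.
 */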
2389 static int
2390 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2391 int data_sg_nents, unsigned int *data_sg_offset,
2392 struct scatterlist *meta_sg, int meta_sg_nents,
2393 unsigned int *meta_sg_offset)
2394 {
2395 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2396 unsigned int sg_offset = 0;
2397 int n = 0;
2398
2399 mr->meta_length = 0;
2400 if (data_sg_nents == 1) {
2401 n++;
2402 mr->mmkey.ndescs = 1;
2403 if (data_sg_offset)
2404 sg_offset = *data_sg_offset;
2405 mr->data_length = sg_dma_len(data_sg) - sg_offset;
2406 mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2407 if (meta_sg_nents == 1) {
2408 n++;
2409 mr->meta_ndescs = 1;
2410 if (meta_sg_offset)
2411 sg_offset = *meta_sg_offset;
2412 else
2413 sg_offset = 0;
2414 mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2415 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2416 }
2417 ibmr->length = mr->data_length + mr->meta_length;
2418 }
2419
2420 return n;
2421 }
2422
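/*
 * Translate the data (and optional metadata) scatterlists into KLM
 * descriptors referencing the PD's local_dma_lkey, updating the MR's length
 * and descriptor counts as it goes.
 */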
2423 static int
2424 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2425 struct scatterlist *sgl,
2426 unsigned short sg_nents,
2427 unsigned int *sg_offset_p,
2428 struct scatterlist *meta_sgl,
2429 unsigned short meta_sg_nents,
2430 unsigned int *meta_sg_offset_p)
2431 {
2432 struct scatterlist *sg = sgl;
2433 struct mlx5_klm *klms = mr->descs;
2434 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2435 u32 lkey = mr->ibmr.pd->local_dma_lkey;
2436 int i, j = 0;
2437
2438 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2439 mr->ibmr.length = 0;
2440
2441 for_each_sg(sgl, sg, sg_nents, i) {
2442 if (unlikely(i >= mr->max_descs))
2443 break;
2444 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2445 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2446 klms[i].key = cpu_to_be32(lkey);
2447 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2448
2449 sg_offset = 0;
2450 }
2451
2452 if (sg_offset_p)
2453 *sg_offset_p = sg_offset;
2454
2455 mr->mmkey.ndescs = i;
2456 mr->data_length = mr->ibmr.length;
2457
2458 if (meta_sg_nents) {
2459 sg = meta_sgl;
2460 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2461 for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2462 if (unlikely(i + j >= mr->max_descs))
2463 break;
2464 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2465 sg_offset);
2466 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2467 sg_offset);
2468 klms[i + j].key = cpu_to_be32(lkey);
2469 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2470
2471 sg_offset = 0;
2472 }
2473 if (meta_sg_offset_p)
2474 *meta_sg_offset_p = sg_offset;
2475
2476 mr->meta_ndescs = j;
2477 mr->meta_length = mr->ibmr.length - mr->data_length;
2478 }
2479
2480 return i + j;
2481 }
2482
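/* ib_sg_to_pages() callback: append one page address to the MR's page list. */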
2483 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2484 {
2485 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2486 __be64 *descs;
2487
2488 if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2489 return -ENOMEM;
2490
2491 descs = mr->descs;
2492 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2493
2494 return 0;
2495 }
2496
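/*
 * ib_sg_to_pages() callback for the metadata pages of a pi MR: append the
 * page after the already mapped data pages.
 */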
2497 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2498 {
2499 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2500 __be64 *descs;
2501
2502 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2503 return -ENOMEM;
2504
2505 descs = mr->descs;
2506 descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2507 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2508
2509 return 0;
2510 }
2511
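/*
 * Map data and metadata for an integrity MR onto the internal MTT pi MR,
 * covering the gap between the end of the data and the start of the metadata.
 */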
2512 static int
2513 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2514 int data_sg_nents, unsigned int *data_sg_offset,
2515 struct scatterlist *meta_sg, int meta_sg_nents,
2516 unsigned int *meta_sg_offset)
2517 {
2518 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2519 struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2520 int n;
2521
2522 pi_mr->mmkey.ndescs = 0;
2523 pi_mr->meta_ndescs = 0;
2524 pi_mr->meta_length = 0;
2525
2526 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2527 pi_mr->desc_size * pi_mr->max_descs,
2528 DMA_TO_DEVICE);
2529
2530 pi_mr->ibmr.page_size = ibmr->page_size;
2531 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2532 mlx5_set_page);
2533 if (n != data_sg_nents)
2534 return n;
2535
2536 pi_mr->data_iova = pi_mr->ibmr.iova;
2537 pi_mr->data_length = pi_mr->ibmr.length;
2538 pi_mr->ibmr.length = pi_mr->data_length;
2539 ibmr->length = pi_mr->data_length;
2540
2541 if (meta_sg_nents) {
2542 u64 page_mask = ~((u64)ibmr->page_size - 1);
2543 u64 iova = pi_mr->data_iova;
2544
2545 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2546 meta_sg_offset, mlx5_set_page_pi);
2547
2548 pi_mr->meta_length = pi_mr->ibmr.length;
2549 /*
2550 * PI address for the HW is the offset of the metadata address
2551 * relative to the first data page address.
2552		 * It equals the first data page address + the size of the data
2553		 * pages + the metadata offset within the first metadata page.
2554 */
2555 pi_mr->pi_iova = (iova & page_mask) +
2556 pi_mr->mmkey.ndescs * ibmr->page_size +
2557 (pi_mr->ibmr.iova & ~page_mask);
2558 /*
2559		 * In order to use one MTT MR for data and metadata, we also
2560		 * register the gaps between the end of the data and the start of
2561		 * the metadata (the sig MR will verify that the HW accesses the
2562		 * right addresses). This mapping is safe because we use an
2563		 * internal mkey for the registration.
2564 */
2565 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2566 pi_mr->ibmr.iova = iova;
2567 ibmr->length += pi_mr->meta_length;
2568 }
2569
2570 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2571 pi_mr->desc_size * pi_mr->max_descs,
2572 DMA_TO_DEVICE);
2573
2574 return n;
2575 }
2576
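/*
 * Map data and metadata for an integrity MR onto the internal KLM pi MR;
 * used when the MTT mapping cannot cover the scatterlists.
 */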
2577 static int
2578 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2579 int data_sg_nents, unsigned int *data_sg_offset,
2580 struct scatterlist *meta_sg, int meta_sg_nents,
2581 unsigned int *meta_sg_offset)
2582 {
2583 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2584 struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2585 int n;
2586
2587 pi_mr->mmkey.ndescs = 0;
2588 pi_mr->meta_ndescs = 0;
2589 pi_mr->meta_length = 0;
2590
2591 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2592 pi_mr->desc_size * pi_mr->max_descs,
2593 DMA_TO_DEVICE);
2594
2595 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2596 meta_sg, meta_sg_nents, meta_sg_offset);
2597
2598 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2599 pi_mr->desc_size * pi_mr->max_descs,
2600 DMA_TO_DEVICE);
2601
2602 /* This is a zero-based memory region */
2603 pi_mr->data_iova = 0;
2604 pi_mr->ibmr.iova = 0;
2605 pi_mr->pi_iova = pi_mr->data_length;
2606 ibmr->length = pi_mr->ibmr.length;
2607
2608 return n;
2609 }
2610
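/*
 * Map the data and metadata scatterlists of an IB_MR_TYPE_INTEGRITY MR,
 * preferring a PA mapping, then MTT, and finally KLM.
 */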
2611 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2612 int data_sg_nents, unsigned int *data_sg_offset,
2613 struct scatterlist *meta_sg, int meta_sg_nents,
2614 unsigned int *meta_sg_offset)
2615 {
2616 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2617 struct mlx5_ib_mr *pi_mr = NULL;
2618 int n;
2619
2620 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2621
2622 mr->mmkey.ndescs = 0;
2623 mr->data_length = 0;
2624 mr->data_iova = 0;
2625 mr->meta_ndescs = 0;
2626 mr->pi_iova = 0;
2627 /*
2628	 * As a performance optimization, if possible, there is no need to
2629	 * perform a UMR operation to register the data/metadata buffers.
2630	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2631	 * Fall back to UMR only in case of a failure.
2632 */
2633 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2634 data_sg_offset, meta_sg, meta_sg_nents,
2635 meta_sg_offset);
2636 if (n == data_sg_nents + meta_sg_nents)
2637 goto out;
2638 /*
2639 * As a performance optimization, if possible, there is no need to map
2640 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2641	 * descriptors and fall back to KLM only in case of a failure.
2642	 * It's more efficient for the HW to work with MTT descriptors
2643	 * (especially under high load).
2644 * Use KLM (indirect access) only if it's mandatory.
2645 */
2646 pi_mr = mr->mtt_mr;
2647 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2648 data_sg_offset, meta_sg, meta_sg_nents,
2649 meta_sg_offset);
2650 if (n == data_sg_nents + meta_sg_nents)
2651 goto out;
2652
2653 pi_mr = mr->klm_mr;
2654 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2655 data_sg_offset, meta_sg, meta_sg_nents,
2656 meta_sg_offset);
2657 if (unlikely(n != data_sg_nents + meta_sg_nents))
2658 return -ENOMEM;
2659
2660 out:
2661	/* This is a zero-based memory region */
2662 ibmr->iova = 0;
2663 mr->pi_mr = pi_mr;
2664 if (pi_mr)
2665 ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2666 else
2667 ibmr->sig_attrs->meta_length = mr->meta_length;
2668
2669 return 0;
2670 }
2671
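/*
 * Standard ib_map_mr_sg() implementation: fill the MR's descriptor list
 * (KLMs for KLM-mode MRs, page lists otherwise) from the scatterlist.
 */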
2672 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2673 unsigned int *sg_offset)
2674 {
2675 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2676 int n;
2677
2678 mr->mmkey.ndescs = 0;
2679
2680 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2681 mr->desc_size * mr->max_descs,
2682 DMA_TO_DEVICE);
2683
2684 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2685 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2686 NULL);
2687 else
2688 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2689 mlx5_set_page);
2690
2691 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2692 mr->desc_size * mr->max_descs,
2693 DMA_TO_DEVICE);
2694
2695 return n;
2696 }
2697