1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem_odp.h>
43 #include "dm.h"
44 #include "mlx5_ib.h"
45 #include "umr.h"
46
47 enum {
48 MAX_PENDING_REG_MR = 8,
49 };
50
51 #define MLX5_UMR_ALIGN 2048
52
53 static void
54 create_mkey_callback(int status, struct mlx5_async_work *context);
55 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
56 u64 iova, int access_flags,
57 unsigned int page_size, bool populate);
58
59 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
60 struct ib_pd *pd)
61 {
62 struct mlx5_ib_dev *dev = to_mdev(pd->device);
63
64 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
65 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
66 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
67 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
68 MLX5_SET(mkc, mkc, lr, 1);
69
70 if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
71 pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
72 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
73 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
74 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
75 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
76 }
77
78 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
79 MLX5_SET(mkc, mkc, qpn, 0xffffff);
80 MLX5_SET64(mkc, mkc, start_addr, start_addr);
81 }
82
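/*
 * Vary the low 8 "variant" bits of the mkey from a rotating counter so that a
 * reused mkey index does not yield the same full key as its previous user.
 */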
83 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
84 {
85 u8 key = atomic_inc_return(&dev->mkey_var);
86 void *mkc;
87
88 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
89 MLX5_SET(mkc, mkc, mkey_7_0, key);
90 *mkey = key;
91 }
92
93 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
94 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
95 {
96 int ret;
97
98 assign_mkey_variant(dev, &mkey->key, in);
99 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
100 if (!ret)
101 init_waitqueue_head(&mkey->wait);
102
103 return ret;
104 }
105
106 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
107 {
108 struct mlx5_ib_dev *dev = async_create->ent->dev;
109 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
110 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
111
112 MLX5_SET(create_mkey_in, async_create->in, opcode,
113 MLX5_CMD_OP_CREATE_MKEY);
114 assign_mkey_variant(dev, &async_create->mkey, async_create->in);
115 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
116 async_create->out, outlen, create_mkey_callback,
117 &async_create->cb_work);
118 }
119
120 static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
121 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
122
123 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
124 {
125 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
126
127 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
128 }
129
130 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
131 {
132 if (status == -ENXIO) /* core driver is not available */
133 return;
134
135 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
136 if (status != -EREMOTEIO) /* driver specific failure */
137 return;
138
139 /* Failed in FW, print cmd out failure details */
140 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
141 }
142
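/*
 * Reserve the slot at index ent->reserved in the mkeys xarray and, when
 * to_store is given, record it at index ent->stored and advance the stored
 * count. Indices [0, stored) hold ready mkeys; indices [stored, reserved)
 * are placeholders for mkey creations still in flight.
 */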
143 static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
144 void *to_store)
145 {
146 XA_STATE(xas, &ent->mkeys, 0);
147 void *curr;
148
149 if (limit_pendings &&
150 (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
151 return -EAGAIN;
152
153 while (1) {
154 /*
155 * This is cmpxchg(NULL, XA_ZERO_ENTRY); however, this version
156 * doesn't transparently unlock. Instead we set the xas index to
157 * the current value of 'reserved' on every iteration.
158 */
159 xas_set(&xas, ent->reserved);
160 curr = xas_load(&xas);
161 if (!curr) {
162 if (to_store && ent->stored == ent->reserved)
163 xas_store(&xas, to_store);
164 else
165 xas_store(&xas, XA_ZERO_ENTRY);
166 if (xas_valid(&xas)) {
167 ent->reserved++;
168 if (to_store) {
169 if (ent->stored != ent->reserved)
170 __xa_store(&ent->mkeys,
171 ent->stored,
172 to_store,
173 GFP_KERNEL);
174 ent->stored++;
175 queue_adjust_cache_locked(ent);
176 WRITE_ONCE(ent->dev->cache.last_add,
177 jiffies);
178 }
179 }
180 }
181 xa_unlock_irq(&ent->mkeys);
182
183 /*
184 * Notice xas_nomem() must always be called as it cleans
185 * up any cached allocation.
186 */
187 if (!xas_nomem(&xas, GFP_KERNEL))
188 break;
189 xa_lock_irq(&ent->mkeys);
190 }
191 xa_lock_irq(&ent->mkeys);
192 if (xas_error(&xas))
193 return xas_error(&xas);
194 if (WARN_ON(curr))
195 return -EINVAL;
196 return 0;
197 }
198
199 static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
200 void *to_store)
201 {
202 int ret;
203
204 xa_lock_irq(&ent->mkeys);
205 ret = push_mkey_locked(ent, limit_pendings, to_store);
206 xa_unlock_irq(&ent->mkeys);
207 return ret;
208 }
209
210 static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
211 {
212 void *old;
213
214 ent->reserved--;
215 old = __xa_erase(&ent->mkeys, ent->reserved);
216 WARN_ON(old);
217 }
218
219 static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
220 {
221 void *old;
222
223 old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
224 WARN_ON(old);
225 ent->stored++;
226 }
227
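/* Remove the most recently stored mkey from the entry and return it. */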
228 static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
229 {
230 void *old, *xa_mkey;
231
232 ent->stored--;
233 ent->reserved--;
234
235 if (ent->stored == ent->reserved) {
236 xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
237 WARN_ON(!xa_mkey);
238 return (u32)xa_to_value(xa_mkey);
239 }
240
241 xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
242 GFP_KERNEL);
243 WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
244 old = __xa_erase(&ent->mkeys, ent->reserved);
245 WARN_ON(old);
246 return (u32)xa_to_value(xa_mkey);
247 }
248
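/*
 * Completion of an asynchronous CREATE_MKEY: on success push the new mkey
 * into its cache entry, on failure drop the reservation and throttle cache
 * fills for one second via the delay timer.
 */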
249 static void create_mkey_callback(int status, struct mlx5_async_work *context)
250 {
251 struct mlx5r_async_create_mkey *mkey_out =
252 container_of(context, struct mlx5r_async_create_mkey, cb_work);
253 struct mlx5_cache_ent *ent = mkey_out->ent;
254 struct mlx5_ib_dev *dev = ent->dev;
255 unsigned long flags;
256
257 if (status) {
258 create_mkey_warn(dev, status, mkey_out->out);
259 kfree(mkey_out);
260 xa_lock_irqsave(&ent->mkeys, flags);
261 undo_push_reserve_mkey(ent);
262 WRITE_ONCE(dev->fill_delay, 1);
263 xa_unlock_irqrestore(&ent->mkeys, flags);
264 mod_timer(&dev->delay_timer, jiffies + HZ);
265 return;
266 }
267
268 mkey_out->mkey |= mlx5_idx_to_mkey(
269 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
270 WRITE_ONCE(dev->cache.last_add, jiffies);
271
272 xa_lock_irqsave(&ent->mkeys, flags);
273 push_to_reserved(ent, mkey_out->mkey);
274 /* If we are doing fill_to_high_water then keep going. */
275 queue_adjust_cache_locked(ent);
276 xa_unlock_irqrestore(&ent->mkeys, flags);
277 kfree(mkey_out);
278 }
279
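/*
 * Number of 16-byte octowords needed to hold ndescs translation entries for
 * the given access mode (MTT or KSM).
 */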
280 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
281 {
282 int ret = 0;
283
284 switch (access_mode) {
285 case MLX5_MKC_ACCESS_MODE_MTT:
286 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
287 sizeof(struct mlx5_mtt));
288 break;
289 case MLX5_MKC_ACCESS_MODE_KSM:
290 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
291 sizeof(struct mlx5_klm));
292 break;
293 default:
294 WARN_ON(1);
295 }
296 return ret;
297 }
298
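/* Fill the mkey context for a free, UMR-enabled mkey described by the entry's rb_key. */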
299 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
300 {
301 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
302 MLX5_SET(mkc, mkc, free, 1);
303 MLX5_SET(mkc, mkc, umr_en, 1);
304 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
305 MLX5_SET(mkc, mkc, access_mode_4_2,
306 (ent->rb_key.access_mode >> 2) & 0x7);
307
308 MLX5_SET(mkc, mkc, translations_octword_size,
309 get_mkc_octo_size(ent->rb_key.access_mode,
310 ent->rb_key.ndescs));
311 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
312 }
313
314 /* Asynchronously schedule new MRs to be populated in the cache. */
315 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
316 {
317 struct mlx5r_async_create_mkey *async_create;
318 void *mkc;
319 int err = 0;
320 int i;
321
322 for (i = 0; i < num; i++) {
323 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
324 GFP_KERNEL);
325 if (!async_create)
326 return -ENOMEM;
327 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
328 memory_key_mkey_entry);
329 set_cache_mkc(ent, mkc);
330 async_create->ent = ent;
331
332 err = push_mkey(ent, true, NULL);
333 if (err)
334 goto free_async_create;
335
336 err = mlx5_ib_create_mkey_cb(async_create);
337 if (err) {
338 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
339 goto err_undo_reserve;
340 }
341 }
342
343 return 0;
344
345 err_undo_reserve:
346 xa_lock_irq(&ent->mkeys);
347 undo_push_reserve_mkey(ent);
348 xa_unlock_irq(&ent->mkeys);
349 free_async_create:
350 kfree(async_create);
351 return err;
352 }
353
354 /* Synchronously create an MR in the cache */
355 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
356 {
357 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
358 void *mkc;
359 u32 *in;
360 int err;
361
362 in = kzalloc(inlen, GFP_KERNEL);
363 if (!in)
364 return -ENOMEM;
365 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
366 set_cache_mkc(ent, mkc);
367
368 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
369 if (err)
370 goto free_in;
371
372 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
373 free_in:
374 kfree(in);
375 return err;
376 }
377
378 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
379 {
380 u32 mkey;
381
382 lockdep_assert_held(&ent->mkeys.xa_lock);
383 if (!ent->stored)
384 return;
385 mkey = pop_stored_mkey(ent);
386 xa_unlock_irq(&ent->mkeys);
387 mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
388 xa_lock_irq(&ent->mkeys);
389 }
390
391 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
392 bool limit_fill)
393 __acquires(&ent->mkeys) __releases(&ent->mkeys)
394 {
395 int err;
396
397 lockdep_assert_held(&ent->mkeys.xa_lock);
398
399 while (true) {
400 if (limit_fill)
401 target = ent->limit * 2;
402 if (target == ent->reserved)
403 return 0;
404 if (target > ent->reserved) {
405 u32 todo = target - ent->reserved;
406
407 xa_unlock_irq(&ent->mkeys);
408 err = add_keys(ent, todo);
409 if (err == -EAGAIN)
410 usleep_range(3000, 5000);
411 xa_lock_irq(&ent->mkeys);
412 if (err) {
413 if (err != -EAGAIN)
414 return err;
415 } else
416 return 0;
417 } else {
418 remove_cache_mr_locked(ent);
419 }
420 }
421 }
422
423 static ssize_t size_write(struct file *filp, const char __user *buf,
424 size_t count, loff_t *pos)
425 {
426 struct mlx5_cache_ent *ent = filp->private_data;
427 u32 target;
428 int err;
429
430 err = kstrtou32_from_user(buf, count, 0, &target);
431 if (err)
432 return err;
433
434 /*
435 * Target is the new value of total_mrs the user requests; however, we
436 * cannot free MRs that are in use. Compute the target value for stored
437 * mkeys.
438 */
439 xa_lock_irq(&ent->mkeys);
440 if (target < ent->in_use) {
441 err = -EINVAL;
442 goto err_unlock;
443 }
444 target = target - ent->in_use;
445 if (target < ent->limit || target > ent->limit*2) {
446 err = -EINVAL;
447 goto err_unlock;
448 }
449 err = resize_available_mrs(ent, target, false);
450 if (err)
451 goto err_unlock;
452 xa_unlock_irq(&ent->mkeys);
453
454 return count;
455
456 err_unlock:
457 xa_unlock_irq(&ent->mkeys);
458 return err;
459 }
460
461 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
462 loff_t *pos)
463 {
464 struct mlx5_cache_ent *ent = filp->private_data;
465 char lbuf[20];
466 int err;
467
468 err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
469 if (err < 0)
470 return err;
471
472 return simple_read_from_buffer(buf, count, pos, lbuf, err);
473 }
474
475 static const struct file_operations size_fops = {
476 .owner = THIS_MODULE,
477 .open = simple_open,
478 .write = size_write,
479 .read = size_read,
480 };
481
482 static ssize_t limit_write(struct file *filp, const char __user *buf,
483 size_t count, loff_t *pos)
484 {
485 struct mlx5_cache_ent *ent = filp->private_data;
486 u32 var;
487 int err;
488
489 err = kstrtou32_from_user(buf, count, 0, &var);
490 if (err)
491 return err;
492
493 /*
494 * Upon set, we immediately fill the cache to the high water mark
495 * implied by the limit.
496 */
497 xa_lock_irq(&ent->mkeys);
498 ent->limit = var;
499 err = resize_available_mrs(ent, 0, true);
500 xa_unlock_irq(&ent->mkeys);
501 if (err)
502 return err;
503 return count;
504 }
505
506 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
507 loff_t *pos)
508 {
509 struct mlx5_cache_ent *ent = filp->private_data;
510 char lbuf[20];
511 int err;
512
513 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
514 if (err < 0)
515 return err;
516
517 return simple_read_from_buffer(buf, count, pos, lbuf, err);
518 }
519
520 static const struct file_operations limit_fops = {
521 .owner = THIS_MODULE,
522 .open = simple_open,
523 .write = limit_write,
524 .read = limit_read,
525 };
526
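/* True if any cache entry is currently below its low water mark. */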
527 static bool someone_adding(struct mlx5_mkey_cache *cache)
528 {
529 struct mlx5_cache_ent *ent;
530 struct rb_node *node;
531 bool ret;
532
533 mutex_lock(&cache->rb_lock);
534 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
535 ent = rb_entry(node, struct mlx5_cache_ent, node);
536 xa_lock_irq(&ent->mkeys);
537 ret = ent->stored < ent->limit;
538 xa_unlock_irq(&ent->mkeys);
539 if (ret) {
540 mutex_unlock(&cache->rb_lock);
541 return true;
542 }
543 }
544 mutex_unlock(&cache->rb_lock);
545 return false;
546 }
547
548 /*
549 * Check if the bucket is outside the high/low water mark and schedule an async
550 * update. The cache refill has hysteresis: once the low water mark is hit, it
551 * is refilled up to the high mark.
552 */
553 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
554 {
555 lockdep_assert_held(&ent->mkeys.xa_lock);
556
557 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
558 return;
559 if (ent->stored < ent->limit) {
560 ent->fill_to_high_water = true;
561 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
562 } else if (ent->fill_to_high_water &&
563 ent->reserved < 2 * ent->limit) {
564 /*
565 * Once we start populating due to hitting the low water mark,
566 * continue until we pass the high water mark.
567 */
568 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
569 } else if (ent->stored == 2 * ent->limit) {
570 ent->fill_to_high_water = false;
571 } else if (ent->stored > 2 * ent->limit) {
572 /* Queue deletion of excess entries */
573 ent->fill_to_high_water = false;
574 if (ent->stored != ent->reserved)
575 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
576 msecs_to_jiffies(1000));
577 else
578 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
579 }
580 }
581
582 static void __cache_work_func(struct mlx5_cache_ent *ent)
583 {
584 struct mlx5_ib_dev *dev = ent->dev;
585 struct mlx5_mkey_cache *cache = &dev->cache;
586 int err;
587
588 xa_lock_irq(&ent->mkeys);
589 if (ent->disabled)
590 goto out;
591
592 if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
593 !READ_ONCE(dev->fill_delay)) {
594 xa_unlock_irq(&ent->mkeys);
595 err = add_keys(ent, 1);
596 xa_lock_irq(&ent->mkeys);
597 if (ent->disabled)
598 goto out;
599 if (err) {
600 /*
601 * EAGAIN only happens if there are pending MRs, so we
602 * will be rescheduled when storing them. The only
603 * failure path here is ENOMEM.
604 */
605 if (err != -EAGAIN) {
606 mlx5_ib_warn(
607 dev,
608 "add keys command failed, err %d\n",
609 err);
610 queue_delayed_work(cache->wq, &ent->dwork,
611 msecs_to_jiffies(1000));
612 }
613 }
614 } else if (ent->stored > 2 * ent->limit) {
615 bool need_delay;
616
617 /*
618 * The remove_cache_mr() logic is performed as a garbage
619 * collection task. Such a task is intended to run when no
620 * other active processes are running.
621 *
622 * need_resched() returns true if there are user tasks to be
623 * activated in the near future.
624 *
625 * In that case, we don't execute remove_cache_mr() and postpone
626 * the garbage collection work to the next cycle, in order to
627 * free CPU resources for other tasks.
628 */
629 xa_unlock_irq(&ent->mkeys);
630 need_delay = need_resched() || someone_adding(cache) ||
631 !time_after(jiffies,
632 READ_ONCE(cache->last_add) + 300 * HZ);
633 xa_lock_irq(&ent->mkeys);
634 if (ent->disabled)
635 goto out;
636 if (need_delay) {
637 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
638 goto out;
639 }
640 remove_cache_mr_locked(ent);
641 queue_adjust_cache_locked(ent);
642 }
643 out:
644 xa_unlock_irq(&ent->mkeys);
645 }
646
647 static void delayed_cache_work_func(struct work_struct *work)
648 {
649 struct mlx5_cache_ent *ent;
650
651 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
652 __cache_work_func(ent);
653 }
654
655 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
656 struct mlx5r_cache_rb_key key2)
657 {
658 int res;
659
660 res = key1.ats - key2.ats;
661 if (res)
662 return res;
663
664 res = key1.access_mode - key2.access_mode;
665 if (res)
666 return res;
667
668 res = key1.access_flags - key2.access_flags;
669 if (res)
670 return res;
671
672 /*
673 * Keep ndescs last in the compare order since the find function
674 * searches for an exact match on all other properties and only for
675 * the closest match in size.
676 */
677 return key1.ndescs - key2.ndescs;
678 }
679
680 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
681 struct mlx5_cache_ent *ent)
682 {
683 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
684 struct mlx5_cache_ent *cur;
685 int cmp;
686
687 /* Figure out where to put new node */
688 while (*new) {
689 cur = rb_entry(*new, struct mlx5_cache_ent, node);
690 parent = *new;
691 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
692 if (cmp > 0)
693 new = &((*new)->rb_left);
694 if (cmp < 0)
695 new = &((*new)->rb_right);
696 if (cmp == 0) {
697 mutex_unlock(&cache->rb_lock);
698 return -EEXIST;
699 }
700 }
701
702 /* Add new node and rebalance tree. */
703 rb_link_node(&ent->node, parent, new);
704 rb_insert_color(&ent->node, &cache->rb_root);
705
706 return 0;
707 }
708
709 static struct mlx5_cache_ent *
710 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
711 struct mlx5r_cache_rb_key rb_key)
712 {
713 struct rb_node *node = dev->cache.rb_root.rb_node;
714 struct mlx5_cache_ent *cur, *smallest = NULL;
715 int cmp;
716
717 /*
718 * Find the smallest ent with an rb_key greater than or equal to the requested rb_key.
719 */
720 while (node) {
721 cur = rb_entry(node, struct mlx5_cache_ent, node);
722 cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
723 if (cmp > 0) {
724 smallest = cur;
725 node = node->rb_left;
726 }
727 if (cmp < 0)
728 node = node->rb_right;
729 if (cmp == 0)
730 return cur;
731 }
732
733 return (smallest &&
734 smallest->rb_key.access_mode == rb_key.access_mode &&
735 smallest->rb_key.access_flags == rb_key.access_flags &&
736 smallest->rb_key.ats == rb_key.ats) ?
737 smallest :
738 NULL;
739 }
740
741 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
742 struct mlx5_cache_ent *ent,
743 int access_flags)
744 {
745 struct mlx5_ib_mr *mr;
746 int err;
747
748 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
749 if (!mr)
750 return ERR_PTR(-ENOMEM);
751
752 xa_lock_irq(&ent->mkeys);
753 ent->in_use++;
754
755 if (!ent->stored) {
756 queue_adjust_cache_locked(ent);
757 ent->miss++;
758 xa_unlock_irq(&ent->mkeys);
759 err = create_cache_mkey(ent, &mr->mmkey.key);
760 if (err) {
761 xa_lock_irq(&ent->mkeys);
762 ent->in_use--;
763 xa_unlock_irq(&ent->mkeys);
764 kfree(mr);
765 return ERR_PTR(err);
766 }
767 } else {
768 mr->mmkey.key = pop_stored_mkey(ent);
769 queue_adjust_cache_locked(ent);
770 xa_unlock_irq(&ent->mkeys);
771 }
772 mr->mmkey.cache_ent = ent;
773 mr->mmkey.type = MLX5_MKEY_MR;
774 init_waitqueue_head(&mr->mmkey.wait);
775 return mr;
776 }
777
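/*
 * Return the access flags that a UMR cannot change on this device; they are
 * part of the cache rb_key so cached mkeys are only reused with compatible
 * flags.
 */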
778 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
779 int access_flags)
780 {
781 int ret = 0;
782
783 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
784 MLX5_CAP_GEN(dev->mdev, atomic) &&
785 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
786 ret |= IB_ACCESS_REMOTE_ATOMIC;
787
788 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
789 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
790 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
791 ret |= IB_ACCESS_RELAXED_ORDERING;
792
793 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
794 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
795 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
796 ret |= IB_ACCESS_RELAXED_ORDERING;
797
798 return ret;
799 }
800
801 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
802 int access_flags, int access_mode,
803 int ndescs)
804 {
805 struct mlx5r_cache_rb_key rb_key = {
806 .ndescs = ndescs,
807 .access_mode = access_mode,
808 .access_flags = get_unchangeable_access_flags(dev, access_flags)
809 };
810 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
811
812 if (!ent)
813 return ERR_PTR(-EOPNOTSUPP);
814
815 return _mlx5_mr_cache_alloc(dev, ent, access_flags);
816 }
817
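/* Destroy all mkeys currently stored in a cache entry. */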
818 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
819 {
820 u32 mkey;
821
822 cancel_delayed_work(&ent->dwork);
823 xa_lock_irq(&ent->mkeys);
824 while (ent->stored) {
825 mkey = pop_stored_mkey(ent);
826 xa_unlock_irq(&ent->mkeys);
827 mlx5_core_destroy_mkey(dev->mdev, mkey);
828 xa_lock_irq(&ent->mkeys);
829 }
830 xa_unlock_irq(&ent->mkeys);
831 }
832
833 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
834 {
835 if (!mlx5_debugfs_root || dev->is_rep)
836 return;
837
838 debugfs_remove_recursive(dev->cache.fs_root);
839 dev->cache.fs_root = NULL;
840 }
841
842 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
843 struct mlx5_cache_ent *ent)
844 {
845 int order = order_base_2(ent->rb_key.ndescs);
846 struct dentry *dir;
847
848 if (!mlx5_debugfs_root || dev->is_rep)
849 return;
850
851 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
852 order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
853
854 sprintf(ent->name, "%d", order);
855 dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
856 debugfs_create_file("size", 0600, dir, ent, &size_fops);
857 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
858 debugfs_create_ulong("cur", 0400, dir, &ent->stored);
859 debugfs_create_u32("miss", 0600, dir, &ent->miss);
860 }
861
862 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
863 {
864 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
865 struct mlx5_mkey_cache *cache = &dev->cache;
866
867 if (!mlx5_debugfs_root || dev->is_rep)
868 return;
869
870 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
871 }
872
873 static void delay_time_func(struct timer_list *t)
874 {
875 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
876
877 WRITE_ONCE(dev->fill_delay, 0);
878 }
879
880 struct mlx5_cache_ent *
881 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
882 struct mlx5r_cache_rb_key rb_key,
883 bool persistent_entry)
884 {
885 struct mlx5_cache_ent *ent;
886 int order;
887 int ret;
888
889 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
890 if (!ent)
891 return ERR_PTR(-ENOMEM);
892
893 xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
894 ent->rb_key = rb_key;
895 ent->dev = dev;
896 ent->is_tmp = !persistent_entry;
897
898 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
899
900 ret = mlx5_cache_ent_insert(&dev->cache, ent);
901 if (ret) {
902 kfree(ent);
903 return ERR_PTR(ret);
904 }
905
906 if (persistent_entry) {
907 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
908 order = MLX5_IMR_KSM_CACHE_ENTRY;
909 else
910 order = order_base_2(rb_key.ndescs) - 2;
911
912 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
913 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
914 mlx5r_umr_can_load_pas(dev, 0))
915 ent->limit = dev->mdev->profile.mr_cache[order].limit;
916 else
917 ent->limit = 0;
918
919 mlx5_mkey_cache_debugfs_add_ent(dev, ent);
920 } else {
921 mod_delayed_work(ent->dev->cache.wq,
922 &ent->dev->cache.remove_ent_dwork,
923 msecs_to_jiffies(30 * 1000));
924 }
925
926 return ent;
927 }
928
929 static void remove_ent_work_func(struct work_struct *work)
930 {
931 struct mlx5_mkey_cache *cache;
932 struct mlx5_cache_ent *ent;
933 struct rb_node *cur;
934
935 cache = container_of(work, struct mlx5_mkey_cache,
936 remove_ent_dwork.work);
937 mutex_lock(&cache->rb_lock);
938 cur = rb_last(&cache->rb_root);
939 while (cur) {
940 ent = rb_entry(cur, struct mlx5_cache_ent, node);
941 cur = rb_prev(cur);
942 mutex_unlock(&cache->rb_lock);
943
944 xa_lock_irq(&ent->mkeys);
945 if (!ent->is_tmp) {
946 xa_unlock_irq(&ent->mkeys);
947 mutex_lock(&cache->rb_lock);
948 continue;
949 }
950 xa_unlock_irq(&ent->mkeys);
951
952 clean_keys(ent->dev, ent);
953 mutex_lock(&cache->rb_lock);
954 }
955 mutex_unlock(&cache->rb_lock);
956 }
957
958 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
959 {
960 struct mlx5_mkey_cache *cache = &dev->cache;
961 struct rb_root *root = &dev->cache.rb_root;
962 struct mlx5r_cache_rb_key rb_key = {
963 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
964 };
965 struct mlx5_cache_ent *ent;
966 struct rb_node *node;
967 int ret;
968 int i;
969
970 mutex_init(&dev->slow_path_mutex);
971 mutex_init(&dev->cache.rb_lock);
972 dev->cache.rb_root = RB_ROOT;
973 INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
974 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
975 if (!cache->wq) {
976 mlx5_ib_warn(dev, "failed to create work queue\n");
977 return -ENOMEM;
978 }
979
980 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
981 timer_setup(&dev->delay_timer, delay_time_func, 0);
982 mlx5_mkey_cache_debugfs_init(dev);
983 mutex_lock(&cache->rb_lock);
984 for (i = 0; i <= mkey_cache_max_order(dev); i++) {
985 rb_key.ndescs = 1 << (i + 2);
986 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
987 if (IS_ERR(ent)) {
988 ret = PTR_ERR(ent);
989 goto err;
990 }
991 }
992
993 ret = mlx5_odp_init_mkey_cache(dev);
994 if (ret)
995 goto err;
996
997 mutex_unlock(&cache->rb_lock);
998 for (node = rb_first(root); node; node = rb_next(node)) {
999 ent = rb_entry(node, struct mlx5_cache_ent, node);
1000 xa_lock_irq(&ent->mkeys);
1001 queue_adjust_cache_locked(ent);
1002 xa_unlock_irq(&ent->mkeys);
1003 }
1004
1005 return 0;
1006
1007 err:
1008 mutex_unlock(&cache->rb_lock);
1009 mlx5_mkey_cache_debugfs_cleanup(dev);
1010 mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
1011 return ret;
1012 }
1013
1014 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
1015 {
1016 struct rb_root *root = &dev->cache.rb_root;
1017 struct mlx5_cache_ent *ent;
1018 struct rb_node *node;
1019
1020 if (!dev->cache.wq)
1021 return;
1022
1023 cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
1024 mutex_lock(&dev->cache.rb_lock);
1025 for (node = rb_first(root); node; node = rb_next(node)) {
1026 ent = rb_entry(node, struct mlx5_cache_ent, node);
1027 xa_lock_irq(&ent->mkeys);
1028 ent->disabled = true;
1029 xa_unlock_irq(&ent->mkeys);
1030 cancel_delayed_work_sync(&ent->dwork);
1031 }
1032
1033 mlx5_mkey_cache_debugfs_cleanup(dev);
1034 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1035
1036 node = rb_first(root);
1037 while (node) {
1038 ent = rb_entry(node, struct mlx5_cache_ent, node);
1039 node = rb_next(node);
1040 clean_keys(dev, ent);
1041 rb_erase(&ent->node, root);
1042 kfree(ent);
1043 }
1044 mutex_unlock(&dev->cache.rb_lock);
1045
1046 destroy_workqueue(dev->cache.wq);
1047 del_timer_sync(&dev->delay_timer);
1048 }
1049
1050 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1051 {
1052 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1053 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1054 struct mlx5_ib_mr *mr;
1055 void *mkc;
1056 u32 *in;
1057 int err;
1058
1059 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1060 if (!mr)
1061 return ERR_PTR(-ENOMEM);
1062
1063 in = kzalloc(inlen, GFP_KERNEL);
1064 if (!in) {
1065 err = -ENOMEM;
1066 goto err_free;
1067 }
1068
1069 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1070
1071 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1072 MLX5_SET(mkc, mkc, length64, 1);
1073 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1074 pd);
1075
1076 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1077 if (err)
1078 goto err_in;
1079
1080 kfree(in);
1081 mr->mmkey.type = MLX5_MKEY_MR;
1082 mr->ibmr.lkey = mr->mmkey.key;
1083 mr->ibmr.rkey = mr->mmkey.key;
1084 mr->umem = NULL;
1085
1086 return &mr->ibmr;
1087
1088 err_in:
1089 kfree(in);
1090
1091 err_free:
1092 kfree(mr);
1093
1094 return ERR_PTR(err);
1095 }
1096
1097 static int get_octo_len(u64 addr, u64 len, int page_shift)
1098 {
1099 u64 page_size = 1ULL << page_shift;
1100 u64 offset;
1101 int npages;
1102
1103 offset = addr & (page_size - 1);
1104 npages = ALIGN(len + offset, page_size) >> page_shift;
1105 return (npages + 1) / 2;
1106 }
1107
1108 static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1109 {
1110 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1111 return MKEY_CACHE_LAST_STD_ENTRY;
1112 return MLX5_MAX_UMR_SHIFT;
1113 }
1114
1115 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1116 u64 length, int access_flags, u64 iova)
1117 {
1118 mr->ibmr.lkey = mr->mmkey.key;
1119 mr->ibmr.rkey = mr->mmkey.key;
1120 mr->ibmr.length = length;
1121 mr->ibmr.device = &dev->ib_dev;
1122 mr->ibmr.iova = iova;
1123 mr->access_flags = access_flags;
1124 }
1125
1126 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1127 u64 iova)
1128 {
1129 /*
1130 * The alignment of iova has already been checked upon entering
1131 * UVERBS_METHOD_REG_DMABUF_MR.
1132 */
1133 umem->iova = iova;
1134 return PAGE_SIZE;
1135 }
1136
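/*
 * Allocate an MR for the umem, preferring an mkey from the cache; when no
 * matching cache entry exists, fall back to a synchronous reg_create().
 */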
1137 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1138 struct ib_umem *umem, u64 iova,
1139 int access_flags)
1140 {
1141 struct mlx5r_cache_rb_key rb_key = {
1142 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
1143 };
1144 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1145 struct mlx5_cache_ent *ent;
1146 struct mlx5_ib_mr *mr;
1147 unsigned int page_size;
1148
1149 if (umem->is_dmabuf)
1150 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1151 else
1152 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
1153 0, iova);
1154 if (WARN_ON(!page_size))
1155 return ERR_PTR(-EINVAL);
1156
1157 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1158 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1159 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1160 ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1161 /*
1162 * If the MR can't come from the cache then synchronously create an uncached
1163 * one.
1164 */
1165 if (!ent) {
1166 mutex_lock(&dev->slow_path_mutex);
1167 mr = reg_create(pd, umem, iova, access_flags, page_size, false);
1168 mutex_unlock(&dev->slow_path_mutex);
1169 if (IS_ERR(mr))
1170 return mr;
1171 mr->mmkey.rb_key = rb_key;
1172 return mr;
1173 }
1174
1175 mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
1176 if (IS_ERR(mr))
1177 return mr;
1178
1179 mr->ibmr.pd = pd;
1180 mr->umem = umem;
1181 mr->page_shift = order_base_2(page_size);
1182 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1183
1184 return mr;
1185 }
1186
1187 /*
1188 * reg_create() builds the mkey directly with a CREATE_MKEY command; when
1189 * 'populate' is set the page list is included, else the mkey is left free for a later UMR update.
1190 */
1191 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1192 u64 iova, int access_flags,
1193 unsigned int page_size, bool populate)
1194 {
1195 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1196 struct mlx5_ib_mr *mr;
1197 __be64 *pas;
1198 void *mkc;
1199 int inlen;
1200 u32 *in;
1201 int err;
1202 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1203
1204 if (!page_size)
1205 return ERR_PTR(-EINVAL);
1206 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1207 if (!mr)
1208 return ERR_PTR(-ENOMEM);
1209
1210 mr->ibmr.pd = pd;
1211 mr->access_flags = access_flags;
1212 mr->page_shift = order_base_2(page_size);
1213
1214 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1215 if (populate)
1216 inlen += sizeof(*pas) *
1217 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1218 in = kvzalloc(inlen, GFP_KERNEL);
1219 if (!in) {
1220 err = -ENOMEM;
1221 goto err_1;
1222 }
1223 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1224 if (populate) {
1225 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1226 err = -EINVAL;
1227 goto err_2;
1228 }
1229 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1230 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1231 }
1232
1233 /* The pg_access bit allows setting the access flags
1234 * in the page list submitted with the command. */
1235 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1236
1237 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1238 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1239 populate ? pd : dev->umrc.pd);
1240 MLX5_SET(mkc, mkc, free, !populate);
1241 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1242 MLX5_SET(mkc, mkc, umr_en, 1);
1243
1244 MLX5_SET64(mkc, mkc, len, umem->length);
1245 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1246 MLX5_SET(mkc, mkc, translations_octword_size,
1247 get_octo_len(iova, umem->length, mr->page_shift));
1248 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1249 if (mlx5_umem_needs_ats(dev, umem, access_flags))
1250 MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1251 if (populate) {
1252 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1253 get_octo_len(iova, umem->length, mr->page_shift));
1254 }
1255
1256 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1257 if (err) {
1258 mlx5_ib_warn(dev, "create mkey failed\n");
1259 goto err_2;
1260 }
1261 mr->mmkey.type = MLX5_MKEY_MR;
1262 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1263 mr->umem = umem;
1264 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1265 kvfree(in);
1266
1267 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1268
1269 return mr;
1270
1271 err_2:
1272 kvfree(in);
1273 err_1:
1274 kfree(mr);
1275 return ERR_PTR(err);
1276 }
1277
1278 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1279 u64 length, int acc, int mode)
1280 {
1281 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1282 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1283 struct mlx5_ib_mr *mr;
1284 void *mkc;
1285 u32 *in;
1286 int err;
1287
1288 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1289 if (!mr)
1290 return ERR_PTR(-ENOMEM);
1291
1292 in = kzalloc(inlen, GFP_KERNEL);
1293 if (!in) {
1294 err = -ENOMEM;
1295 goto err_free;
1296 }
1297
1298 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1299
1300 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1301 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1302 MLX5_SET64(mkc, mkc, len, length);
1303 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1304
1305 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1306 if (err)
1307 goto err_in;
1308
1309 kfree(in);
1310
1311 set_mr_fields(dev, mr, length, acc, start_addr);
1312
1313 return &mr->ibmr;
1314
1315 err_in:
1316 kfree(in);
1317
1318 err_free:
1319 kfree(mr);
1320
1321 return ERR_PTR(err);
1322 }
1323
1324 int mlx5_ib_advise_mr(struct ib_pd *pd,
1325 enum ib_uverbs_advise_mr_advice advice,
1326 u32 flags,
1327 struct ib_sge *sg_list,
1328 u32 num_sge,
1329 struct uverbs_attr_bundle *attrs)
1330 {
1331 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1332 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1333 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1334 return -EOPNOTSUPP;
1335
1336 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1337 sg_list, num_sge);
1338 }
1339
1340 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1341 struct ib_dm_mr_attr *attr,
1342 struct uverbs_attr_bundle *attrs)
1343 {
1344 struct mlx5_ib_dm *mdm = to_mdm(dm);
1345 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1346 u64 start_addr = mdm->dev_addr + attr->offset;
1347 int mode;
1348
1349 switch (mdm->type) {
1350 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1351 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1352 return ERR_PTR(-EINVAL);
1353
1354 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1355 start_addr -= pci_resource_start(dev->pdev, 0);
1356 break;
1357 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1358 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1359 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1360 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1361 return ERR_PTR(-EINVAL);
1362
1363 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1364 break;
1365 default:
1366 return ERR_PTR(-EINVAL);
1367 }
1368
1369 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1370 attr->access_flags, mode);
1371 }
1372
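/*
 * Register a regular (non-ODP) user MR: through the mkey cache and a UMR
 * when the device can load the page list that way, otherwise through the
 * slow reg_create() path.
 */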
1373 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1374 u64 iova, int access_flags)
1375 {
1376 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1377 struct mlx5_ib_mr *mr = NULL;
1378 bool xlt_with_umr;
1379 int err;
1380
1381 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1382 if (xlt_with_umr) {
1383 mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1384 } else {
1385 unsigned int page_size = mlx5_umem_find_best_pgsz(
1386 umem, mkc, log_page_size, 0, iova);
1387
1388 mutex_lock(&dev->slow_path_mutex);
1389 mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1390 mutex_unlock(&dev->slow_path_mutex);
1391 }
1392 if (IS_ERR(mr)) {
1393 ib_umem_release(umem);
1394 return ERR_CAST(mr);
1395 }
1396
1397 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1398
1399 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1400
1401 if (xlt_with_umr) {
1402 /*
1403 * If the MR was created with reg_create then it will be
1404 * configured properly but left disabled. It is safe to go ahead
1405 * and configure it again via UMR while enabling it.
1406 */
1407 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1408 if (err) {
1409 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1410 return ERR_PTR(err);
1411 }
1412 }
1413 return &mr->ibmr;
1414 }
1415
1416 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1417 u64 iova, int access_flags,
1418 struct ib_udata *udata)
1419 {
1420 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1421 struct ib_umem_odp *odp;
1422 struct mlx5_ib_mr *mr;
1423 int err;
1424
1425 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1426 return ERR_PTR(-EOPNOTSUPP);
1427
1428 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1429 if (err)
1430 return ERR_PTR(err);
1431 if (!start && length == U64_MAX) {
1432 if (iova != 0)
1433 return ERR_PTR(-EINVAL);
1434 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1435 return ERR_PTR(-EINVAL);
1436
1437 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1438 if (IS_ERR(mr))
1439 return ERR_CAST(mr);
1440 return &mr->ibmr;
1441 }
1442
1443 /* ODP requires xlt update via umr to work. */
1444 if (!mlx5r_umr_can_load_pas(dev, length))
1445 return ERR_PTR(-EINVAL);
1446
1447 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1448 &mlx5_mn_ops);
1449 if (IS_ERR(odp))
1450 return ERR_CAST(odp);
1451
1452 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1453 if (IS_ERR(mr)) {
1454 ib_umem_release(&odp->umem);
1455 return ERR_CAST(mr);
1456 }
1457 xa_init(&mr->implicit_children);
1458
1459 odp->private = mr;
1460 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1461 if (err)
1462 goto err_dereg_mr;
1463
1464 err = mlx5_ib_init_odp_mr(mr);
1465 if (err)
1466 goto err_dereg_mr;
1467 return &mr->ibmr;
1468
1469 err_dereg_mr:
1470 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1471 return ERR_PTR(err);
1472 }
1473
1474 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1475 u64 iova, int access_flags,
1476 struct ib_udata *udata)
1477 {
1478 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1479 struct ib_umem *umem;
1480
1481 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1482 return ERR_PTR(-EOPNOTSUPP);
1483
1484 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1485 start, iova, length, access_flags);
1486
1487 if (access_flags & IB_ACCESS_ON_DEMAND)
1488 return create_user_odp_mr(pd, start, length, iova, access_flags,
1489 udata);
1490 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1491 if (IS_ERR(umem))
1492 return ERR_CAST(umem);
1493 return create_real_mr(pd, umem, iova, access_flags);
1494 }
1495
1496 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1497 {
1498 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1499 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1500
1501 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1502
1503 if (!umem_dmabuf->sgt)
1504 return;
1505
1506 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1507 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1508 }
1509
1510 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1511 .allow_peer2peer = 1,
1512 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1513 };
1514
1515 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1516 u64 length, u64 virt_addr,
1517 int fd, int access_flags,
1518 struct ib_udata *udata)
1519 {
1520 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1521 struct mlx5_ib_mr *mr = NULL;
1522 struct ib_umem_dmabuf *umem_dmabuf;
1523 int err;
1524
1525 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1526 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1527 return ERR_PTR(-EOPNOTSUPP);
1528
1529 mlx5_ib_dbg(dev,
1530 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1531 offset, virt_addr, length, fd, access_flags);
1532
1533 /* dmabuf requires xlt update via umr to work. */
1534 if (!mlx5r_umr_can_load_pas(dev, length))
1535 return ERR_PTR(-EINVAL);
1536
1537 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1538 access_flags,
1539 &mlx5_ib_dmabuf_attach_ops);
1540 if (IS_ERR(umem_dmabuf)) {
1541 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1542 PTR_ERR(umem_dmabuf));
1543 return ERR_CAST(umem_dmabuf);
1544 }
1545
1546 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1547 access_flags);
1548 if (IS_ERR(mr)) {
1549 ib_umem_release(&umem_dmabuf->umem);
1550 return ERR_CAST(mr);
1551 }
1552
1553 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1554
1555 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1556 umem_dmabuf->private = mr;
1557 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1558 if (err)
1559 goto err_dereg_mr;
1560
1561 err = mlx5_ib_init_dmabuf_mr(mr);
1562 if (err)
1563 goto err_dereg_mr;
1564 return &mr->ibmr;
1565
1566 err_dereg_mr:
1567 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1568 return ERR_PTR(err);
1569 }
1570
1571 /*
1572 * True if the change in access flags can be done via UMR, only some access
1573 * flags can be updated.
1574 */
1575 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1576 unsigned int current_access_flags,
1577 unsigned int target_access_flags)
1578 {
1579 unsigned int diffs = current_access_flags ^ target_access_flags;
1580
1581 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1582 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1583 return false;
1584 return mlx5r_umr_can_reconfig(dev, current_access_flags,
1585 target_access_flags);
1586 }
1587
1588 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1589 struct ib_umem *new_umem,
1590 int new_access_flags, u64 iova,
1591 unsigned long *page_size)
1592 {
1593 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1594
1595 /* We only track the allocated sizes of MRs from the cache */
1596 if (!mr->mmkey.cache_ent)
1597 return false;
1598 if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1599 return false;
1600
1601 *page_size =
1602 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1603 if (WARN_ON(!*page_size))
1604 return false;
1605 return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1606 ib_umem_num_dma_blocks(new_umem, *page_size);
1607 }
1608
1609 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1610 int access_flags, int flags, struct ib_umem *new_umem,
1611 u64 iova, unsigned long page_size)
1612 {
1613 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1614 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1615 struct ib_umem *old_umem = mr->umem;
1616 int err;
1617
1618 /*
1619 * To keep everything simple, the MR is revoked before we start to
1620 * mess with it. This ensures the change is atomic relative to any
1621 * use of the MR.
1622 */
1623 err = mlx5r_umr_revoke_mr(mr);
1624 if (err)
1625 return err;
1626
1627 if (flags & IB_MR_REREG_PD) {
1628 mr->ibmr.pd = pd;
1629 upd_flags |= MLX5_IB_UPD_XLT_PD;
1630 }
1631 if (flags & IB_MR_REREG_ACCESS) {
1632 mr->access_flags = access_flags;
1633 upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1634 }
1635
1636 mr->ibmr.iova = iova;
1637 mr->ibmr.length = new_umem->length;
1638 mr->page_shift = order_base_2(page_size);
1639 mr->umem = new_umem;
1640 err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1641 if (err) {
1642 /*
1643 * The MR is revoked at this point, so there is no issue with freeing
1644 * new_umem.
1645 */
1646 mr->umem = old_umem;
1647 return err;
1648 }
1649
1650 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1651 ib_umem_release(old_umem);
1652 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1653 return 0;
1654 }
1655
1656 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1657 u64 length, u64 iova, int new_access_flags,
1658 struct ib_pd *new_pd,
1659 struct ib_udata *udata)
1660 {
1661 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1662 struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1663 int err;
1664
1665 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1666 return ERR_PTR(-EOPNOTSUPP);
1667
1668 mlx5_ib_dbg(
1669 dev,
1670 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1671 start, iova, length, new_access_flags);
1672
1673 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1674 return ERR_PTR(-EOPNOTSUPP);
1675
1676 if (!(flags & IB_MR_REREG_ACCESS))
1677 new_access_flags = mr->access_flags;
1678 if (!(flags & IB_MR_REREG_PD))
1679 new_pd = ib_mr->pd;
1680
1681 if (!(flags & IB_MR_REREG_TRANS)) {
1682 struct ib_umem *umem;
1683
1684 /* Fast path for PD/access change */
1685 if (can_use_umr_rereg_access(dev, mr->access_flags,
1686 new_access_flags)) {
1687 err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1688 new_access_flags);
1689 if (err)
1690 return ERR_PTR(err);
1691 return NULL;
1692 }
1693 /* DM or ODP MRs don't have a normal umem, so we can't reuse it */
1694 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1695 goto recreate;
1696
1697 /*
1698 * Only one active MR can refer to a umem at one time; revoke the old
1699 * MR before assigning the umem to the new one.
1700 */
1701 err = mlx5r_umr_revoke_mr(mr);
1702 if (err)
1703 return ERR_PTR(err);
1704 umem = mr->umem;
1705 mr->umem = NULL;
1706 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1707
1708 return create_real_mr(new_pd, umem, mr->ibmr.iova,
1709 new_access_flags);
1710 }
1711
1712 /*
1713 * DM doesn't have a PAS list, so we can't reuse it; ODP/dmabuf does,
1714 * but the logic around releasing the umem is different.
1715 */
1716 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1717 goto recreate;
1718
1719 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1720 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1721 struct ib_umem *new_umem;
1722 unsigned long page_size;
1723
1724 new_umem = ib_umem_get(&dev->ib_dev, start, length,
1725 new_access_flags);
1726 if (IS_ERR(new_umem))
1727 return ERR_CAST(new_umem);
1728
1729 /* Fast path for PAS change */
1730 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1731 &page_size)) {
1732 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1733 new_umem, iova, page_size);
1734 if (err) {
1735 ib_umem_release(new_umem);
1736 return ERR_PTR(err);
1737 }
1738 return NULL;
1739 }
1740 return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1741 }
1742
1743 /*
1744 * Everything else has no state we can preserve; just create a new MR
1745 * from scratch.
1746 */
1747 recreate:
1748 return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1749 new_access_flags, udata);
1750 }
1751
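/*
 * Allocate and DMA-map the private descriptor buffer for a kernel MR,
 * over-allocating so the descriptors can be aligned to MLX5_UMR_ALIGN.
 */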
1752 static int
1753 mlx5_alloc_priv_descs(struct ib_device *device,
1754 struct mlx5_ib_mr *mr,
1755 int ndescs,
1756 int desc_size)
1757 {
1758 struct mlx5_ib_dev *dev = to_mdev(device);
1759 struct device *ddev = &dev->mdev->pdev->dev;
1760 int size = ndescs * desc_size;
1761 int add_size;
1762 int ret;
1763
1764 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1765
1766 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1767 if (!mr->descs_alloc)
1768 return -ENOMEM;
1769
1770 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1771
1772 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1773 if (dma_mapping_error(ddev, mr->desc_map)) {
1774 ret = -ENOMEM;
1775 goto err;
1776 }
1777
1778 return 0;
1779 err:
1780 kfree(mr->descs_alloc);
1781
1782 return ret;
1783 }
1784
1785 static void
1786 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1787 {
1788 if (!mr->umem && mr->descs) {
1789 struct ib_device *device = mr->ibmr.device;
1790 int size = mr->max_descs * mr->desc_size;
1791 struct mlx5_ib_dev *dev = to_mdev(device);
1792
1793 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1794 DMA_TO_DEVICE);
1795 kfree(mr->descs_alloc);
1796 mr->descs = NULL;
1797 }
1798 }
1799
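/*
 * On dereg, return the mkey to its cache entry; if the MR was not allocated
 * from the cache, find or create a matching (temporary) entry to store it in.
 */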
1800 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
1801 struct mlx5_ib_mr *mr)
1802 {
1803 struct mlx5_mkey_cache *cache = &dev->cache;
1804 struct mlx5_cache_ent *ent;
1805 int ret;
1806
1807 if (mr->mmkey.cache_ent) {
1808 xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1809 mr->mmkey.cache_ent->in_use--;
1810 goto end;
1811 }
1812
1813 mutex_lock(&cache->rb_lock);
1814 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
1815 if (ent) {
1816 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
1817 if (ent->disabled) {
1818 mutex_unlock(&cache->rb_lock);
1819 return -EOPNOTSUPP;
1820 }
1821 mr->mmkey.cache_ent = ent;
1822 xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1823 mutex_unlock(&cache->rb_lock);
1824 goto end;
1825 }
1826 }
1827
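/*
 * No entry matches this rb_key; create one so the mkey can still be
 * parked in the cache instead of being destroyed.
 */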
1828 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
1829 mutex_unlock(&cache->rb_lock);
1830 if (IS_ERR(ent))
1831 return PTR_ERR(ent);
1832
1833 mr->mmkey.cache_ent = ent;
1834 xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
1835
1836 end:
1837 ret = push_mkey_locked(mr->mmkey.cache_ent, false,
1838 xa_mk_value(mr->mmkey.key));
1839 xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
1840 return ret;
1841 }
1842
1843 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1844 {
1845 struct mlx5_ib_mr *mr = to_mmr(ibmr);
1846 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1847 int rc;
1848
1849 /*
1850 * Any async use of the MR must hold the refcount; once the refcount
1851 * goes to zero no other thread (ODP page faults, prefetch, UMR
1852 * activity, etc.) can touch the mkey, so it is safe to destroy it.
1853 */
1854 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1855 refcount_read(&mr->mmkey.usecount) != 0 &&
1856 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1857 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1858
1859 if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1860 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1861 mr->sig, NULL, GFP_KERNEL);
1862
1863 if (mr->mtt_mr) {
1864 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1865 if (rc)
1866 return rc;
1867 mr->mtt_mr = NULL;
1868 }
1869 if (mr->klm_mr) {
1870 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1871 if (rc)
1872 return rc;
1873 mr->klm_mr = NULL;
1874 }
1875
1876 if (mlx5_core_destroy_psv(dev->mdev,
1877 mr->sig->psv_memory.psv_idx))
1878 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1879 mr->sig->psv_memory.psv_idx);
1880 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1881 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1882 mr->sig->psv_wire.psv_idx);
1883 kfree(mr->sig);
1884 mr->sig = NULL;
1885 }
1886
1887 /* Stop DMA */
1888 if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
1889 if (mlx5r_umr_revoke_mr(mr) ||
1890 cache_ent_find_and_store(dev, mr))
1891 mr->mmkey.cache_ent = NULL;
1892
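/* The mkey was not returned to the cache, destroy it. */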
1893 if (!mr->mmkey.cache_ent) {
1894 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1895 if (rc)
1896 return rc;
1897 }
1898
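/*
 * Release the umem and, for non-ODP MRs, the reg_pages accounting;
 * ODP MRs have extra state freed by mlx5_ib_free_odp_mr().
 */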
1899 if (mr->umem) {
1900 bool is_odp = is_odp_mr(mr);
1901
1902 if (!is_odp)
1903 atomic_sub(ib_umem_num_pages(mr->umem),
1904 &dev->mdev->priv.reg_pages);
1905 ib_umem_release(mr->umem);
1906 if (is_odp)
1907 mlx5_ib_free_odp_mr(mr);
1908 }
1909
1910 if (!mr->mmkey.cache_ent)
1911 mlx5_free_priv_descs(mr);
1912
1913 kfree(mr);
1914 return 0;
1915 }
1916
1917 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1918 int access_mode, int page_shift)
1919 {
1920 void *mkc;
1921
1922 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1923
1924 /* This is only used from the kernel, so setting the PD is OK. */
1925 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
1926 MLX5_SET(mkc, mkc, free, 1);
1927 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1928 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1929 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1930 MLX5_SET(mkc, mkc, umr_en, 1);
1931 MLX5_SET(mkc, mkc, log_page_size, page_shift);
1932 }
1933
1934 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1935 int ndescs, int desc_size, int page_shift,
1936 int access_mode, u32 *in, int inlen)
1937 {
1938 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1939 int err;
1940
1941 mr->access_mode = access_mode;
1942 mr->desc_size = desc_size;
1943 mr->max_descs = ndescs;
1944
1945 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1946 if (err)
1947 return err;
1948
1949 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1950
1951 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1952 if (err)
1953 goto err_free_descs;
1954
1955 mr->mmkey.type = MLX5_MKEY_MR;
1956 mr->ibmr.lkey = mr->mmkey.key;
1957 mr->ibmr.rkey = mr->mmkey.key;
1958
1959 return 0;
1960
1961 err_free_descs:
1962 mlx5_free_priv_descs(mr);
1963 return err;
1964 }
1965
1966 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1967 u32 max_num_sg, u32 max_num_meta_sg,
1968 int desc_size, int access_mode)
1969 {
1970 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1971 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
1972 int page_shift = 0;
1973 struct mlx5_ib_mr *mr;
1974 u32 *in;
1975 int err;
1976
1977 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1978 if (!mr)
1979 return ERR_PTR(-ENOMEM);
1980
1981 mr->ibmr.pd = pd;
1982 mr->ibmr.device = pd->device;
1983
1984 in = kzalloc(inlen, GFP_KERNEL);
1985 if (!in) {
1986 err = -ENOMEM;
1987 goto err_free;
1988 }
1989
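/*
 * MTT descriptors translate whole pages, so the mkey needs a page
 * shift; KLM descriptors carry explicit byte counts and use 0.
 */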
1990 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
1991 page_shift = PAGE_SHIFT;
1992
1993 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
1994 access_mode, in, inlen);
1995 if (err)
1996 goto err_free_in;
1997
1998 mr->umem = NULL;
1999 kfree(in);
2000
2001 return mr;
2002
2003 err_free_in:
2004 kfree(in);
2005 err_free:
2006 kfree(mr);
2007 return ERR_PTR(err);
2008 }
2009
2010 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2011 int ndescs, u32 *in, int inlen)
2012 {
2013 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2014 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2015 inlen);
2016 }
2017
2018 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2019 int ndescs, u32 *in, int inlen)
2020 {
2021 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2022 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2023 }
2024
2025 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2026 int max_num_sg, int max_num_meta_sg,
2027 u32 *in, int inlen)
2028 {
2029 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2030 u32 psv_index[2];
2031 void *mkc;
2032 int err;
2033
2034 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2035 if (!mr->sig)
2036 return -ENOMEM;
2037
2038 /* create mem & wire PSVs */
2039 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2040 if (err)
2041 goto err_free_sig;
2042
2043 mr->sig->psv_memory.psv_idx = psv_index[0];
2044 mr->sig->psv_wire.psv_idx = psv_index[1];
2045
2046 mr->sig->sig_status_checked = true;
2047 mr->sig->sig_err_exists = false;
2048 /* Next UMR, Arm SIGERR */
2049 ++mr->sig->sigerr_count;
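/*
 * Allocate the internal PI MRs used by mlx5_ib_map_mr_sg_pi(): a KLM
 * MR for arbitrary SG layouts and an MTT MR for the common
 * page-aligned case.
 */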
2050 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2051 sizeof(struct mlx5_klm),
2052 MLX5_MKC_ACCESS_MODE_KLMS);
2053 if (IS_ERR(mr->klm_mr)) {
2054 err = PTR_ERR(mr->klm_mr);
2055 goto err_destroy_psv;
2056 }
2057 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2058 sizeof(struct mlx5_mtt),
2059 MLX5_MKC_ACCESS_MODE_MTT);
2060 if (IS_ERR(mr->mtt_mr)) {
2061 err = PTR_ERR(mr->mtt_mr);
2062 goto err_free_klm_mr;
2063 }
2064
2065 /* Set bsf descriptors for mkey */
2066 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2067 MLX5_SET(mkc, mkc, bsf_en, 1);
2068 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2069
2070 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2071 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2072 if (err)
2073 goto err_free_mtt_mr;
2074
2075 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2076 mr->sig, GFP_KERNEL));
2077 if (err)
2078 goto err_free_descs;
2079 return 0;
2080
2081 err_free_descs:
2082 destroy_mkey(dev, mr);
2083 mlx5_free_priv_descs(mr);
2084 err_free_mtt_mr:
2085 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2086 mr->mtt_mr = NULL;
2087 err_free_klm_mr:
2088 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2089 mr->klm_mr = NULL;
2090 err_destroy_psv:
2091 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2092 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2093 mr->sig->psv_memory.psv_idx);
2094 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2095 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2096 mr->sig->psv_wire.psv_idx);
2097 err_free_sig:
2098 kfree(mr->sig);
2099
2100 return err;
2101 }
2102
2103 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2104 enum ib_mr_type mr_type, u32 max_num_sg,
2105 u32 max_num_meta_sg)
2106 {
2107 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2108 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2109 int ndescs = ALIGN(max_num_sg, 4);
2110 struct mlx5_ib_mr *mr;
2111 u32 *in;
2112 int err;
2113
2114 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2115 if (!mr)
2116 return ERR_PTR(-ENOMEM);
2117
2118 in = kzalloc(inlen, GFP_KERNEL);
2119 if (!in) {
2120 err = -ENOMEM;
2121 goto err_free;
2122 }
2123
2124 mr->ibmr.device = pd->device;
2125 mr->umem = NULL;
2126
2127 switch (mr_type) {
2128 case IB_MR_TYPE_MEM_REG:
2129 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2130 break;
2131 case IB_MR_TYPE_SG_GAPS:
2132 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2133 break;
2134 case IB_MR_TYPE_INTEGRITY:
2135 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2136 max_num_meta_sg, in, inlen);
2137 break;
2138 default:
2139 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2140 err = -EINVAL;
2141 }
2142
2143 if (err)
2144 goto err_free_in;
2145
2146 kfree(in);
2147
2148 return &mr->ibmr;
2149
2150 err_free_in:
2151 kfree(in);
2152 err_free:
2153 kfree(mr);
2154 return ERR_PTR(err);
2155 }
2156
2157 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2158 u32 max_num_sg)
2159 {
2160 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2161 }
2162
2163 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2164 u32 max_num_sg, u32 max_num_meta_sg)
2165 {
2166 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2167 max_num_meta_sg);
2168 }
2169
2170 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2171 {
2172 struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2173 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2174 struct mlx5_ib_mw *mw = to_mmw(ibmw);
2175 unsigned int ndescs;
2176 u32 *in = NULL;
2177 void *mkc;
2178 int err;
2179 struct mlx5_ib_alloc_mw req = {};
2180 struct {
2181 __u32 comp_mask;
2182 __u32 response_length;
2183 } resp = {};
2184
2185 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2186 if (err)
2187 return err;
2188
2189 if (req.comp_mask || req.reserved1 || req.reserved2)
2190 return -EOPNOTSUPP;
2191
2192 if (udata->inlen > sizeof(req) &&
2193 !ib_is_udata_cleared(udata, sizeof(req),
2194 udata->inlen - sizeof(req)))
2195 return -EOPNOTSUPP;
2196
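/*
 * Round the KLM count up to a multiple of 4; a MW with no KLMs
 * still needs one group of translation entries.
 */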
2197 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2198
2199 in = kzalloc(inlen, GFP_KERNEL);
2200 if (!in)
2201 return -ENOMEM;
2202
2203 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2204
2205 MLX5_SET(mkc, mkc, free, 1);
2206 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2207 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2208 MLX5_SET(mkc, mkc, umr_en, 1);
2209 MLX5_SET(mkc, mkc, lr, 1);
2210 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2211 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2212 MLX5_SET(mkc, mkc, qpn, 0xffffff);
2213
2214 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2215 if (err)
2216 goto free;
2217
2218 mw->mmkey.type = MLX5_MKEY_MW;
2219 ibmw->rkey = mw->mmkey.key;
2220 mw->mmkey.ndescs = ndescs;
2221
2222 resp.response_length =
2223 min(offsetofend(typeof(resp), response_length), udata->outlen);
2224 if (resp.response_length) {
2225 err = ib_copy_to_udata(udata, &resp, resp.response_length);
2226 if (err)
2227 goto free_mkey;
2228 }
2229
2230 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2231 err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2232 if (err)
2233 goto free_mkey;
2234 }
2235
2236 kfree(in);
2237 return 0;
2238
2239 free_mkey:
2240 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2241 free:
2242 kfree(in);
2243 return err;
2244 }
2245
2246 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2247 {
2248 struct mlx5_ib_dev *dev = to_mdev(mw->device);
2249 struct mlx5_ib_mw *mmw = to_mmw(mw);
2250
2251 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2252 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2253 /*
2254 * pagefault_single_data_segment() may be accessing mmw
2255 * if the user bound an ODP MR to this MW.
2256 */
2257 mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2258
2259 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2260 }
2261
2262 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2263 struct ib_mr_status *mr_status)
2264 {
2265 struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2266 int ret = 0;
2267
2268 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2269 pr_err("Invalid status check mask\n");
2270 ret = -EINVAL;
2271 goto done;
2272 }
2273
2274 mr_status->fail_status = 0;
2275 if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2276 if (!mmr->sig) {
2277 ret = -EINVAL;
2278 pr_err("signature status check requested on a non-signature enabled MR\n");
2279 goto done;
2280 }
2281
2282 mmr->sig->sig_status_checked = true;
2283 if (!mmr->sig->sig_err_exists)
2284 goto done;
2285
2286 if (ibmr->lkey == mmr->sig->err_item.key)
2287 memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2288 sizeof(mr_status->sig_err));
2289 else {
2290 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2291 mr_status->sig_err.sig_err_offset = 0;
2292 mr_status->sig_err.key = mmr->sig->err_item.key;
2293 }
2294
2295 mmr->sig->sig_err_exists = false;
2296 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2297 }
2298
2299 done:
2300 return ret;
2301 }
2302
2303 static int
2304 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2305 int data_sg_nents, unsigned int *data_sg_offset,
2306 struct scatterlist *meta_sg, int meta_sg_nents,
2307 unsigned int *meta_sg_offset)
2308 {
2309 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2310 unsigned int sg_offset = 0;
2311 int n = 0;
2312
2313 mr->meta_length = 0;
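/*
 * The PA fast path handles at most one data SG entry and one
 * metadata SG entry, recording their DMA addresses directly instead
 * of building descriptors.
 */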
2314 if (data_sg_nents == 1) {
2315 n++;
2316 mr->mmkey.ndescs = 1;
2317 if (data_sg_offset)
2318 sg_offset = *data_sg_offset;
2319 mr->data_length = sg_dma_len(data_sg) - sg_offset;
2320 mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2321 if (meta_sg_nents == 1) {
2322 n++;
2323 mr->meta_ndescs = 1;
2324 if (meta_sg_offset)
2325 sg_offset = *meta_sg_offset;
2326 else
2327 sg_offset = 0;
2328 mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2329 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2330 }
2331 ibmr->length = mr->data_length + mr->meta_length;
2332 }
2333
2334 return n;
2335 }
2336
2337 static int
2338 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2339 struct scatterlist *sgl,
2340 unsigned short sg_nents,
2341 unsigned int *sg_offset_p,
2342 struct scatterlist *meta_sgl,
2343 unsigned short meta_sg_nents,
2344 unsigned int *meta_sg_offset_p)
2345 {
2346 struct scatterlist *sg = sgl;
2347 struct mlx5_klm *klms = mr->descs;
2348 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2349 u32 lkey = mr->ibmr.pd->local_dma_lkey;
2350 int i, j = 0;
2351
2352 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2353 mr->ibmr.length = 0;
2354
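/*
 * One KLM per SG entry, each pointing at the PD's local_dma_lkey;
 * only the first entry honors the initial offset.
 */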
2355 for_each_sg(sgl, sg, sg_nents, i) {
2356 if (unlikely(i >= mr->max_descs))
2357 break;
2358 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2359 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2360 klms[i].key = cpu_to_be32(lkey);
2361 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2362
2363 sg_offset = 0;
2364 }
2365
2366 if (sg_offset_p)
2367 *sg_offset_p = sg_offset;
2368
2369 mr->mmkey.ndescs = i;
2370 mr->data_length = mr->ibmr.length;
2371
2372 if (meta_sg_nents) {
2373 sg = meta_sgl;
2374 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2375 for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2376 if (unlikely(i + j >= mr->max_descs))
2377 break;
2378 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2379 sg_offset);
2380 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2381 sg_offset);
2382 klms[i + j].key = cpu_to_be32(lkey);
2383 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2384
2385 sg_offset = 0;
2386 }
2387 if (meta_sg_offset_p)
2388 *meta_sg_offset_p = sg_offset;
2389
2390 mr->meta_ndescs = j;
2391 mr->meta_length = mr->ibmr.length - mr->data_length;
2392 }
2393
2394 return i + j;
2395 }
2396
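/*
 * ib_sg_to_pages() callback: store one page address in the MTT
 * array with read and write access enabled.
 */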
2397 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2398 {
2399 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2400 __be64 *descs;
2401
2402 if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2403 return -ENOMEM;
2404
2405 descs = mr->descs;
2406 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2407
2408 return 0;
2409 }
2410
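/*
 * As mlx5_set_page(), but metadata pages are appended after the
 * data descriptors.
 */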
2411 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2412 {
2413 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2414 __be64 *descs;
2415
2416 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2417 return -ENOMEM;
2418
2419 descs = mr->descs;
2420 descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2421 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2422
2423 return 0;
2424 }
2425
2426 static int
2427 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2428 int data_sg_nents, unsigned int *data_sg_offset,
2429 struct scatterlist *meta_sg, int meta_sg_nents,
2430 unsigned int *meta_sg_offset)
2431 {
2432 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2433 struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2434 int n;
2435
2436 pi_mr->mmkey.ndescs = 0;
2437 pi_mr->meta_ndescs = 0;
2438 pi_mr->meta_length = 0;
2439
2440 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2441 pi_mr->desc_size * pi_mr->max_descs,
2442 DMA_TO_DEVICE);
2443
2444 pi_mr->ibmr.page_size = ibmr->page_size;
2445 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2446 mlx5_set_page);
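/* Data pages could not all be mapped to MTTs; let the caller fall back to KLM. */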
2447 if (n != data_sg_nents)
2448 return n;
2449
2450 pi_mr->data_iova = pi_mr->ibmr.iova;
2451 pi_mr->data_length = pi_mr->ibmr.length;
2452 pi_mr->ibmr.length = pi_mr->data_length;
2453 ibmr->length = pi_mr->data_length;
2454
2455 if (meta_sg_nents) {
2456 u64 page_mask = ~((u64)ibmr->page_size - 1);
2457 u64 iova = pi_mr->data_iova;
2458
2459 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2460 meta_sg_offset, mlx5_set_page_pi);
2461
2462 pi_mr->meta_length = pi_mr->ibmr.length;
2463 /*
2464 * PI address for the HW is the offset of the metadata address
2465 * relative to the first data page address.
2466 * It equals the first data page address + the size of the data pages +
2467 * the metadata offset within the first metadata page.
2468 */
2469 pi_mr->pi_iova = (iova & page_mask) +
2470 pi_mr->mmkey.ndescs * ibmr->page_size +
2471 (pi_mr->ibmr.iova & ~page_mask);
2472 /*
2473 * In order to use one MTT MR for data and metadata, we also
2474 * register the gaps between the end of the data and the start of
2475 * the metadata (the sig MR verifies that the HW accesses the
2476 * right addresses). This mapping is safe because we use an
2477 * internal mkey for the registration.
2478 */
2479 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2480 pi_mr->ibmr.iova = iova;
2481 ibmr->length += pi_mr->meta_length;
2482 }
2483
2484 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2485 pi_mr->desc_size * pi_mr->max_descs,
2486 DMA_TO_DEVICE);
2487
2488 return n;
2489 }
2490
2491 static int
2492 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2493 int data_sg_nents, unsigned int *data_sg_offset,
2494 struct scatterlist *meta_sg, int meta_sg_nents,
2495 unsigned int *meta_sg_offset)
2496 {
2497 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2498 struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2499 int n;
2500
2501 pi_mr->mmkey.ndescs = 0;
2502 pi_mr->meta_ndescs = 0;
2503 pi_mr->meta_length = 0;
2504
2505 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2506 pi_mr->desc_size * pi_mr->max_descs,
2507 DMA_TO_DEVICE);
2508
2509 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2510 meta_sg, meta_sg_nents, meta_sg_offset);
2511
2512 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2513 pi_mr->desc_size * pi_mr->max_descs,
2514 DMA_TO_DEVICE);
2515
2516 /* This is a zero-based memory region */
2517 pi_mr->data_iova = 0;
2518 pi_mr->ibmr.iova = 0;
2519 pi_mr->pi_iova = pi_mr->data_length;
2520 ibmr->length = pi_mr->ibmr.length;
2521
2522 return n;
2523 }
2524
2525 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2526 int data_sg_nents, unsigned int *data_sg_offset,
2527 struct scatterlist *meta_sg, int meta_sg_nents,
2528 unsigned int *meta_sg_offset)
2529 {
2530 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2531 struct mlx5_ib_mr *pi_mr = NULL;
2532 int n;
2533
2534 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2535
2536 mr->mmkey.ndescs = 0;
2537 mr->data_length = 0;
2538 mr->data_iova = 0;
2539 mr->meta_ndescs = 0;
2540 mr->pi_iova = 0;
2541 /*
2542 * As a performance optimization, avoid a UMR operation to register
2543 * the data/metadata buffers whenever possible.
2544 * First try to map the sg lists to PA descriptors with local_dma_lkey;
2545 * fall back to UMR only in case of a failure.
2546 */
2547 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2548 data_sg_offset, meta_sg, meta_sg_nents,
2549 meta_sg_offset);
2550 if (n == data_sg_nents + meta_sg_nents)
2551 goto out;
2552 /*
2553 * As a performance optimization, avoid mapping the sg lists to KLM
2554 * descriptors if possible. First try to map the sg lists to MTT
2555 * descriptors and fall back to KLM only in case of a failure.
2556 * It's more efficient for the HW to work with MTT descriptors
2557 * (especially under high load).
2558 * Use KLM (indirect access) only when it's mandatory.
2559 */
2560 pi_mr = mr->mtt_mr;
2561 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2562 data_sg_offset, meta_sg, meta_sg_nents,
2563 meta_sg_offset);
2564 if (n == data_sg_nents + meta_sg_nents)
2565 goto out;
2566
2567 pi_mr = mr->klm_mr;
2568 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2569 data_sg_offset, meta_sg, meta_sg_nents,
2570 meta_sg_offset);
2571 if (unlikely(n != data_sg_nents + meta_sg_nents))
2572 return -ENOMEM;
2573
2574 out:
2575 /* This is a zero-based memory region */
2576 ibmr->iova = 0;
2577 mr->pi_mr = pi_mr;
2578 if (pi_mr)
2579 ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2580 else
2581 ibmr->sig_attrs->meta_length = mr->meta_length;
2582
2583 return 0;
2584 }
2585
2586 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2587 unsigned int *sg_offset)
2588 {
2589 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2590 int n;
2591
2592 mr->mmkey.ndescs = 0;
2593
2594 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2595 mr->desc_size * mr->max_descs,
2596 DMA_TO_DEVICE);
2597
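/*
 * MRs created with KLM access mode (e.g. IB_MR_TYPE_SG_GAPS) take
 * byte-granular KLM descriptors; everything else is packed into
 * page-sized MTT descriptors.
 */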
2598 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2599 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2600 NULL);
2601 else
2602 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2603 mlx5_set_page);
2604
2605 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2606 mr->desc_size * mr->max_descs,
2607 DMA_TO_DEVICE);
2608
2609 return n;
2610 }
2611