1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  KVM guest address space mapping code
4   *
5   *    Copyright IBM Corp. 2007, 2020
6   *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7   *		 David Hildenbrand <david@redhat.com>
8   *		 Janosch Frank <frankja@linux.vnet.ibm.com>
9   */
10  
11  #include <linux/kernel.h>
12  #include <linux/pagewalk.h>
13  #include <linux/swap.h>
14  #include <linux/smp.h>
15  #include <linux/spinlock.h>
16  #include <linux/slab.h>
17  #include <linux/swapops.h>
18  #include <linux/ksm.h>
19  #include <linux/mman.h>
20  #include <linux/pgtable.h>
21  
22  #include <asm/pgalloc.h>
23  #include <asm/gmap.h>
24  #include <asm/tlb.h>
25  
26  #define GMAP_SHADOW_FAKE_TABLE 1ULL
27  
28  /**
29   * gmap_alloc - allocate and initialize a guest address space
30   * @limit: maximum address of the gmap address space
31   *
32   * Returns a guest address space structure, or NULL if out of memory.
33   */
34  static struct gmap *gmap_alloc(unsigned long limit)
35  {
36  	struct gmap *gmap;
37  	struct page *page;
38  	unsigned long *table;
39  	unsigned long etype, atype;
40  
41  	if (limit < _REGION3_SIZE) {
42  		limit = _REGION3_SIZE - 1;
43  		atype = _ASCE_TYPE_SEGMENT;
44  		etype = _SEGMENT_ENTRY_EMPTY;
45  	} else if (limit < _REGION2_SIZE) {
46  		limit = _REGION2_SIZE - 1;
47  		atype = _ASCE_TYPE_REGION3;
48  		etype = _REGION3_ENTRY_EMPTY;
49  	} else if (limit < _REGION1_SIZE) {
50  		limit = _REGION1_SIZE - 1;
51  		atype = _ASCE_TYPE_REGION2;
52  		etype = _REGION2_ENTRY_EMPTY;
53  	} else {
54  		limit = -1UL;
55  		atype = _ASCE_TYPE_REGION1;
56  		etype = _REGION1_ENTRY_EMPTY;
57  	}
58  	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
59  	if (!gmap)
60  		goto out;
61  	INIT_LIST_HEAD(&gmap->crst_list);
62  	INIT_LIST_HEAD(&gmap->children);
63  	INIT_LIST_HEAD(&gmap->pt_list);
64  	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
65  	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
66  	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
67  	spin_lock_init(&gmap->guest_table_lock);
68  	spin_lock_init(&gmap->shadow_lock);
69  	refcount_set(&gmap->ref_count, 1);
70  	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
71  	if (!page)
72  		goto out_free;
73  	page->index = 0;
74  	list_add(&page->lru, &gmap->crst_list);
75  	table = page_to_virt(page);
76  	crst_table_init(table, etype);
77  	gmap->table = table;
78  	gmap->asce = atype | _ASCE_TABLE_LENGTH |
79  		_ASCE_USER_BITS | __pa(table);
80  	gmap->asce_end = limit;
81  	return gmap;
82  
83  out_free:
84  	kfree(gmap);
85  out:
86  	return NULL;
87  }
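/*
 * Editor's illustration (not part of the original source): the limit passed
 * to gmap_alloc() selects the top-level table type.  On s390 the boundaries
 * work out roughly as follows:
 *
 *	limit <  2 GB (_REGION3_SIZE) -> segment table ASCE,   2 GB space
 *	limit <  4 TB (_REGION2_SIZE) -> region-3 table ASCE,  4 TB space
 *	limit <  8 PB (_REGION1_SIZE) -> region-2 table ASCE,  8 PB space
 *	otherwise                     -> region-1 table ASCE, 2^64 space
 */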
88  
89  /**
90   * gmap_create - create a guest address space
91   * @mm: pointer to the parent mm_struct
92   * @limit: maximum address of the gmap address space
93   *
94   * Returns a guest address space structure, or NULL if out of memory.
95   */
96  struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
97  {
98  	struct gmap *gmap;
99  	unsigned long gmap_asce;
100  
101  	gmap = gmap_alloc(limit);
102  	if (!gmap)
103  		return NULL;
104  	gmap->mm = mm;
105  	spin_lock(&mm->context.lock);
106  	list_add_rcu(&gmap->list, &mm->context.gmap_list);
107  	if (list_is_singular(&mm->context.gmap_list))
108  		gmap_asce = gmap->asce;
109  	else
110  		gmap_asce = -1UL;
111  	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
112  	spin_unlock(&mm->context.lock);
113  	return gmap;
114  }
115  EXPORT_SYMBOL_GPL(gmap_create);
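/*
 * Illustrative sketch (editor's addition, not compiled): how a caller such
 * as KVM might set up and later tear down a guest address space.  The limit
 * value and the function name are examples only.
 */
#if 0
static struct gmap *example_setup_gmap(struct mm_struct *mm)
{
	/* limit is the highest guest address; 4 TB - 1 selects a region-3 ASCE */
	struct gmap *gmap = gmap_create(mm, (1UL << 42) - 1);

	if (!gmap)
		return NULL;
	/* ... use the gmap ... */
	gmap_remove(gmap);	/* drops the creation reference via gmap_put() */
	return NULL;
}
#endif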
116  
117  static void gmap_flush_tlb(struct gmap *gmap)
118  {
119  	if (MACHINE_HAS_IDTE)
120  		__tlb_flush_idte(gmap->asce);
121  	else
122  		__tlb_flush_global();
123  }
124  
125  static void gmap_radix_tree_free(struct radix_tree_root *root)
126  {
127  	struct radix_tree_iter iter;
128  	unsigned long indices[16];
129  	unsigned long index;
130  	void __rcu **slot;
131  	int i, nr;
132  
133  	/* A radix tree is freed by deleting all of its entries */
134  	index = 0;
135  	do {
136  		nr = 0;
137  		radix_tree_for_each_slot(slot, root, &iter, index) {
138  			indices[nr] = iter.index;
139  			if (++nr == 16)
140  				break;
141  		}
142  		for (i = 0; i < nr; i++) {
143  			index = indices[i];
144  			radix_tree_delete(root, index);
145  		}
146  	} while (nr > 0);
147  }
148  
149  static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
150  {
151  	struct gmap_rmap *rmap, *rnext, *head;
152  	struct radix_tree_iter iter;
153  	unsigned long indices[16];
154  	unsigned long index;
155  	void __rcu **slot;
156  	int i, nr;
157  
158  	/* A radix tree is freed by deleting all of its entries */
159  	index = 0;
160  	do {
161  		nr = 0;
162  		radix_tree_for_each_slot(slot, root, &iter, index) {
163  			indices[nr] = iter.index;
164  			if (++nr == 16)
165  				break;
166  		}
167  		for (i = 0; i < nr; i++) {
168  			index = indices[i];
169  			head = radix_tree_delete(root, index);
170  			gmap_for_each_rmap_safe(rmap, rnext, head)
171  				kfree(rmap);
172  		}
173  	} while (nr > 0);
174  }
175  
176  /**
177   * gmap_free - free a guest address space
178   * @gmap: pointer to the guest address space structure
179   *
180   * No locks required. There are no references to this gmap anymore.
181   */
182  static void gmap_free(struct gmap *gmap)
183  {
184  	struct page *page, *next;
185  
186  	/* Flush tlb of all gmaps (if not already done for shadows) */
187  	if (!(gmap_is_shadow(gmap) && gmap->removed))
188  		gmap_flush_tlb(gmap);
189  	/* Free all segment & region tables. */
190  	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
191  		__free_pages(page, CRST_ALLOC_ORDER);
192  	gmap_radix_tree_free(&gmap->guest_to_host);
193  	gmap_radix_tree_free(&gmap->host_to_guest);
194  
195  	/* Free additional data for a shadow gmap */
196  	if (gmap_is_shadow(gmap)) {
197  		/* Free all page tables. */
198  		list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
199  			page_table_free_pgste(page);
200  		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
201  		/* Release reference to the parent */
202  		gmap_put(gmap->parent);
203  	}
204  
205  	kfree(gmap);
206  }
207  
208  /**
209   * gmap_get - increase reference counter for guest address space
210   * @gmap: pointer to the guest address space structure
211   *
212   * Returns the gmap pointer
213   */
214  struct gmap *gmap_get(struct gmap *gmap)
215  {
216  	refcount_inc(&gmap->ref_count);
217  	return gmap;
218  }
219  EXPORT_SYMBOL_GPL(gmap_get);
220  
221  /**
222   * gmap_put - decrease reference counter for guest address space
223   * @gmap: pointer to the guest address space structure
224   *
225   * If the reference counter reaches zero the guest address space is freed.
226   */
227  void gmap_put(struct gmap *gmap)
228  {
229  	if (refcount_dec_and_test(&gmap->ref_count))
230  		gmap_free(gmap);
231  }
232  EXPORT_SYMBOL_GPL(gmap_put);
233  
234  /**
235   * gmap_remove - remove a guest address space but do not free it yet
236   * @gmap: pointer to the guest address space structure
237   */
238  void gmap_remove(struct gmap *gmap)
239  {
240  	struct gmap *sg, *next;
241  	unsigned long gmap_asce;
242  
243  	/* Remove all shadow gmaps linked to this gmap */
244  	if (!list_empty(&gmap->children)) {
245  		spin_lock(&gmap->shadow_lock);
246  		list_for_each_entry_safe(sg, next, &gmap->children, list) {
247  			list_del(&sg->list);
248  			gmap_put(sg);
249  		}
250  		spin_unlock(&gmap->shadow_lock);
251  	}
252  	/* Remove gmap from the per-mm gmap list */
253  	spin_lock(&gmap->mm->context.lock);
254  	list_del_rcu(&gmap->list);
255  	if (list_empty(&gmap->mm->context.gmap_list))
256  		gmap_asce = 0;
257  	else if (list_is_singular(&gmap->mm->context.gmap_list))
258  		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
259  					     struct gmap, list)->asce;
260  	else
261  		gmap_asce = -1UL;
262  	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
263  	spin_unlock(&gmap->mm->context.lock);
264  	synchronize_rcu();
265  	/* Put reference */
266  	gmap_put(gmap);
267  }
268  EXPORT_SYMBOL_GPL(gmap_remove);
269  
270  /**
271   * gmap_enable - switch primary space to the guest address space
272   * @gmap: pointer to the guest address space structure
273   */
274  void gmap_enable(struct gmap *gmap)
275  {
276  	S390_lowcore.gmap = (unsigned long) gmap;
277  }
278  EXPORT_SYMBOL_GPL(gmap_enable);
279  
280  /**
281   * gmap_disable - switch back to the standard primary address space
282   * @gmap: pointer to the guest address space structure
283   */
284  void gmap_disable(struct gmap *gmap)
285  {
286  	S390_lowcore.gmap = 0UL;
287  }
288  EXPORT_SYMBOL_GPL(gmap_disable);
289  
290  /**
291   * gmap_get_enabled - get a pointer to the currently enabled gmap
292   *
293   * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
294   */
295  struct gmap *gmap_get_enabled(void)
296  {
297  	return (struct gmap *) S390_lowcore.gmap;
298  }
299  EXPORT_SYMBOL_GPL(gmap_get_enabled);
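/*
 * Illustrative sketch (editor's addition, not compiled): gmap_enable() only
 * records the gmap in the lowcore; a hypervisor would typically bracket
 * guest execution with enable/disable and can query the currently enabled
 * gmap via gmap_get_enabled().
 */
#if 0
static void example_run_guest(struct gmap *gmap)
{
	gmap_enable(gmap);		/* S390_lowcore.gmap = gmap */
	/* ... enter SIE and run the guest ... */
	WARN_ON(gmap_get_enabled() != gmap);
	gmap_disable(gmap);		/* S390_lowcore.gmap = 0 */
}
#endif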
300  
301  /*
302   * gmap_alloc_table is assumed to be called with mmap_lock held
303   */
304  static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
305  			    unsigned long init, unsigned long gaddr)
306  {
307  	struct page *page;
308  	unsigned long *new;
309  
310  	/* since we don't free the gmap table until gmap_free() we can unlock */
311  	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
312  	if (!page)
313  		return -ENOMEM;
314  	new = page_to_virt(page);
315  	crst_table_init(new, init);
316  	spin_lock(&gmap->guest_table_lock);
317  	if (*table & _REGION_ENTRY_INVALID) {
318  		list_add(&page->lru, &gmap->crst_list);
319  		*table = __pa(new) | _REGION_ENTRY_LENGTH |
320  			(*table & _REGION_ENTRY_TYPE_MASK);
321  		page->index = gaddr;
322  		page = NULL;
323  	}
324  	spin_unlock(&gmap->guest_table_lock);
325  	if (page)
326  		__free_pages(page, CRST_ALLOC_ORDER);
327  	return 0;
328  }
329  
330  /**
331   * __gmap_segment_gaddr - find virtual address from segment pointer
332   * @entry: pointer to a segment table entry in the guest address space
333   *
334   * Returns the virtual address in the guest address space for the segment
335   */
336  static unsigned long __gmap_segment_gaddr(unsigned long *entry)
337  {
338  	struct page *page;
339  	unsigned long offset;
340  
341  	offset = (unsigned long) entry / sizeof(unsigned long);
342  	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
343  	page = pmd_pgtable_page((pmd_t *) entry);
344  	return page->index + offset;
345  }
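/*
 * Worked example (editor's illustration): with 1 MB segments and 2048
 * entries per segment table, a segment table entry at index 0x18 of a table
 * whose page->index is 0x80000000 resolves to the guest address
 * 0x80000000 + 0x18 * 0x100000 = 0x81800000.
 */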
346  
347  /**
348   * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
349   * @gmap: pointer to the guest address space structure
350   * @vmaddr: address in the host process address space
351   *
352   * Returns 1 if a TLB flush is required
353   */
354  static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
355  {
356  	unsigned long *entry;
357  	int flush = 0;
358  
359  	BUG_ON(gmap_is_shadow(gmap));
360  	spin_lock(&gmap->guest_table_lock);
361  	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
362  	if (entry) {
363  		flush = (*entry != _SEGMENT_ENTRY_EMPTY);
364  		*entry = _SEGMENT_ENTRY_EMPTY;
365  	}
366  	spin_unlock(&gmap->guest_table_lock);
367  	return flush;
368  }
369  
370  /**
371   * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
372   * @gmap: pointer to the guest address space structure
373   * @gaddr: address in the guest address space
374   *
375   * Returns 1 if a TLB flush is required
376   */
377  static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
378  {
379  	unsigned long vmaddr;
380  
381  	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
382  						   gaddr >> PMD_SHIFT);
383  	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
384  }
385  
386  /**
387   * gmap_unmap_segment - unmap segment from the guest address space
388   * @gmap: pointer to the guest address space structure
389   * @to: address in the guest address space
390   * @len: length of the memory area to unmap
391   *
392   * Returns 0 if the unmap succeeded, -EINVAL if not.
393   */
394  int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
395  {
396  	unsigned long off;
397  	int flush;
398  
399  	BUG_ON(gmap_is_shadow(gmap));
400  	if ((to | len) & (PMD_SIZE - 1))
401  		return -EINVAL;
402  	if (len == 0 || to + len < to)
403  		return -EINVAL;
404  
405  	flush = 0;
406  	mmap_write_lock(gmap->mm);
407  	for (off = 0; off < len; off += PMD_SIZE)
408  		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
409  	mmap_write_unlock(gmap->mm);
410  	if (flush)
411  		gmap_flush_tlb(gmap);
412  	return 0;
413  }
414  EXPORT_SYMBOL_GPL(gmap_unmap_segment);
415  
416  /**
417   * gmap_map_segment - map a segment to the guest address space
418   * @gmap: pointer to the guest address space structure
419   * @from: source address in the parent address space
420   * @to: target address in the guest address space
421   * @len: length of the memory area to map
422   *
423   * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
424   */
425  int gmap_map_segment(struct gmap *gmap, unsigned long from,
426  		     unsigned long to, unsigned long len)
427  {
428  	unsigned long off;
429  	int flush;
430  
431  	BUG_ON(gmap_is_shadow(gmap));
432  	if ((from | to | len) & (PMD_SIZE - 1))
433  		return -EINVAL;
434  	if (len == 0 || from + len < from || to + len < to ||
435  	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
436  		return -EINVAL;
437  
438  	flush = 0;
439  	mmap_write_lock(gmap->mm);
440  	for (off = 0; off < len; off += PMD_SIZE) {
441  		/* Remove old translation */
442  		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
443  		/* Store new translation */
444  		if (radix_tree_insert(&gmap->guest_to_host,
445  				      (to + off) >> PMD_SHIFT,
446  				      (void *) from + off))
447  			break;
448  	}
449  	mmap_write_unlock(gmap->mm);
450  	if (flush)
451  		gmap_flush_tlb(gmap);
452  	if (off >= len)
453  		return 0;
454  	gmap_unmap_segment(gmap, to, len);
455  	return -ENOMEM;
456  }
457  EXPORT_SYMBOL_GPL(gmap_map_segment);
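/*
 * Illustrative sketch (editor's addition, not compiled): gmap_map_segment()
 * and gmap_unmap_segment() work on whole 1 MB segments, so from, to and len
 * must be PMD_SIZE aligned.  The addresses below are made up for the example.
 */
#if 0
static int example_map_guest_ram(struct gmap *gmap)
{
	unsigned long host_base = 0x40000000UL;	/* host address, 1 MB aligned */
	unsigned long guest_base = 0UL;		/* guest address */
	unsigned long size = 256UL << 20;	/* 256 MB, a multiple of PMD_SIZE */

	return gmap_map_segment(gmap, host_base, guest_base, size);
}
#endif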
458  
459  /**
460   * __gmap_translate - translate a guest address to a user space address
461   * @gmap: pointer to guest mapping meta data structure
462   * @gaddr: guest address
463   *
464   * Returns user space address which corresponds to the guest address or
465   * -EFAULT if no such mapping exists.
466   * This function does not establish potentially missing page table entries.
467   * The mmap_lock of the mm that belongs to the address space must be held
468   * when this function gets called.
469   *
470   * Note: Can also be called for shadow gmaps.
471   */
472  unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
473  {
474  	unsigned long vmaddr;
475  
476  	vmaddr = (unsigned long)
477  		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
478  	/* Note: guest_to_host is empty for a shadow gmap */
479  	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
480  }
481  EXPORT_SYMBOL_GPL(__gmap_translate);
482  
483  /**
484   * gmap_translate - translate a guest address to a user space address
485   * @gmap: pointer to guest mapping meta data structure
486   * @gaddr: guest address
487   *
488   * Returns user space address which corresponds to the guest address or
489   * -EFAULT if no such mapping exists.
490   * This function does not establish potentially missing page table entries.
491   */
492  unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
493  {
494  	unsigned long rc;
495  
496  	mmap_read_lock(gmap->mm);
497  	rc = __gmap_translate(gmap, gaddr);
498  	mmap_read_unlock(gmap->mm);
499  	return rc;
500  }
501  EXPORT_SYMBOL_GPL(gmap_translate);
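/*
 * Illustrative sketch (editor's addition, not compiled): translation errors
 * are encoded in the returned value, so callers check with IS_ERR_VALUE()
 * before using the result as a host address.
 */
#if 0
static int example_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr = gmap_translate(gmap, gaddr);

	if (IS_ERR_VALUE(vmaddr))
		return (int)vmaddr;	/* -EFAULT: no segment mapping exists */
	/* vmaddr is the corresponding address in the host process */
	return 0;
}
#endif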
502  
503  /**
504   * gmap_unlink - disconnect a page table from the gmap shadow tables
505   * @mm: pointer to the parent mm_struct
506   * @table: pointer to the host page table
507   * @vmaddr: vm address associated with the host page table
508   */
509  void gmap_unlink(struct mm_struct *mm, unsigned long *table,
510  		 unsigned long vmaddr)
511  {
512  	struct gmap *gmap;
513  	int flush;
514  
515  	rcu_read_lock();
516  	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
517  		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
518  		if (flush)
519  			gmap_flush_tlb(gmap);
520  	}
521  	rcu_read_unlock();
522  }
523  
524  static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
525  			   unsigned long gaddr);
526  
527  /**
528   * __gmap_link - set up shadow page tables to connect a host to a guest address
529   * @gmap: pointer to guest mapping meta data structure
530   * @gaddr: guest address
531   * @vmaddr: vm address
532   *
533   * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
534   * if the vm address is already mapped to a different guest segment.
535   * The mmap_lock of the mm that belongs to the address space must be held
536   * when this function gets called.
537   */
538  int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
539  {
540  	struct mm_struct *mm;
541  	unsigned long *table;
542  	spinlock_t *ptl;
543  	pgd_t *pgd;
544  	p4d_t *p4d;
545  	pud_t *pud;
546  	pmd_t *pmd;
547  	u64 unprot;
548  	int rc;
549  
550  	BUG_ON(gmap_is_shadow(gmap));
551  	/* Create higher level tables in the gmap page table */
552  	table = gmap->table;
553  	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
554  		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
555  		if ((*table & _REGION_ENTRY_INVALID) &&
556  		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
557  				     gaddr & _REGION1_MASK))
558  			return -ENOMEM;
559  		table = __va(*table & _REGION_ENTRY_ORIGIN);
560  	}
561  	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
562  		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
563  		if ((*table & _REGION_ENTRY_INVALID) &&
564  		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
565  				     gaddr & _REGION2_MASK))
566  			return -ENOMEM;
567  		table = __va(*table & _REGION_ENTRY_ORIGIN);
568  	}
569  	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
570  		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
571  		if ((*table & _REGION_ENTRY_INVALID) &&
572  		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
573  				     gaddr & _REGION3_MASK))
574  			return -ENOMEM;
575  		table = __va(*table & _REGION_ENTRY_ORIGIN);
576  	}
577  	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
578  	/* Walk the parent mm page table */
579  	mm = gmap->mm;
580  	pgd = pgd_offset(mm, vmaddr);
581  	VM_BUG_ON(pgd_none(*pgd));
582  	p4d = p4d_offset(pgd, vmaddr);
583  	VM_BUG_ON(p4d_none(*p4d));
584  	pud = pud_offset(p4d, vmaddr);
585  	VM_BUG_ON(pud_none(*pud));
586  	/* large puds cannot yet be handled */
587  	if (pud_large(*pud))
588  		return -EFAULT;
589  	pmd = pmd_offset(pud, vmaddr);
590  	VM_BUG_ON(pmd_none(*pmd));
591  	/* Are we allowed to use huge pages? */
592  	if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
593  		return -EFAULT;
594  	/* Link gmap segment table entry location to page table. */
595  	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
596  	if (rc)
597  		return rc;
598  	ptl = pmd_lock(mm, pmd);
599  	spin_lock(&gmap->guest_table_lock);
600  	if (*table == _SEGMENT_ENTRY_EMPTY) {
601  		rc = radix_tree_insert(&gmap->host_to_guest,
602  				       vmaddr >> PMD_SHIFT, table);
603  		if (!rc) {
604  			if (pmd_large(*pmd)) {
605  				*table = (pmd_val(*pmd) &
606  					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
607  					| _SEGMENT_ENTRY_GMAP_UC;
608  			} else
609  				*table = pmd_val(*pmd) &
610  					_SEGMENT_ENTRY_HARDWARE_BITS;
611  		}
612  	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
613  		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
614  		unprot = (u64)*table;
615  		unprot &= ~_SEGMENT_ENTRY_PROTECT;
616  		unprot |= _SEGMENT_ENTRY_GMAP_UC;
617  		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
618  	}
619  	spin_unlock(&gmap->guest_table_lock);
620  	spin_unlock(ptl);
621  	radix_tree_preload_end();
622  	return rc;
623  }
624  
625  /**
626   * gmap_fault - resolve a fault on a guest address
627   * @gmap: pointer to guest mapping meta data structure
628   * @gaddr: guest address
629   * @fault_flags: flags to pass down to handle_mm_fault()
630   *
631   * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
632   * if the vm address is already mapped to a different guest segment.
633   */
634  int gmap_fault(struct gmap *gmap, unsigned long gaddr,
635  	       unsigned int fault_flags)
636  {
637  	unsigned long vmaddr;
638  	int rc;
639  	bool unlocked;
640  
641  	mmap_read_lock(gmap->mm);
642  
643  retry:
644  	unlocked = false;
645  	vmaddr = __gmap_translate(gmap, gaddr);
646  	if (IS_ERR_VALUE(vmaddr)) {
647  		rc = vmaddr;
648  		goto out_up;
649  	}
650  	if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
651  			     &unlocked)) {
652  		rc = -EFAULT;
653  		goto out_up;
654  	}
655  	/*
656  	 * In case fixup_user_fault() unlocked the mmap_lock during the fault-in,
657  	 * redo __gmap_translate() to avoid racing with a map/unmap_segment.
658  	 */
659  	if (unlocked)
660  		goto retry;
661  
662  	rc = __gmap_link(gmap, gaddr, vmaddr);
663  out_up:
664  	mmap_read_unlock(gmap->mm);
665  	return rc;
666  }
667  EXPORT_SYMBOL_GPL(gmap_fault);
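/*
 * Illustrative sketch (editor's addition, not compiled): resolving a guest
 * write fault.  The fault flags are forwarded to fixup_user_fault().
 */
#if 0
static int example_handle_guest_fault(struct gmap *gmap, unsigned long gaddr)
{
	/* 0: resolved, -EFAULT: no usable mapping, -ENOMEM: out of memory */
	return gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE);
}
#endif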
668  
669  /*
670   * this function is assumed to be called with mmap_lock held
671   */
672  void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
673  {
674  	struct vm_area_struct *vma;
675  	unsigned long vmaddr;
676  	spinlock_t *ptl;
677  	pte_t *ptep;
678  
679  	/* Find the vm address for the guest address */
680  	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
681  						   gaddr >> PMD_SHIFT);
682  	if (vmaddr) {
683  		vmaddr |= gaddr & ~PMD_MASK;
684  
685  		vma = vma_lookup(gmap->mm, vmaddr);
686  		if (!vma || is_vm_hugetlb_page(vma))
687  			return;
688  
689  		/* Get pointer to the page table entry */
690  		ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
691  		if (likely(ptep)) {
692  			ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
693  			pte_unmap_unlock(ptep, ptl);
694  		}
695  	}
696  }
697  EXPORT_SYMBOL_GPL(__gmap_zap);
698  
699  void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
700  {
701  	unsigned long gaddr, vmaddr, size;
702  	struct vm_area_struct *vma;
703  
704  	mmap_read_lock(gmap->mm);
705  	for (gaddr = from; gaddr < to;
706  	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
707  		/* Find the vm address for the guest address */
708  		vmaddr = (unsigned long)
709  			radix_tree_lookup(&gmap->guest_to_host,
710  					  gaddr >> PMD_SHIFT);
711  		if (!vmaddr)
712  			continue;
713  		vmaddr |= gaddr & ~PMD_MASK;
714  		/* Find vma in the parent mm */
715  		vma = find_vma(gmap->mm, vmaddr);
716  		if (!vma)
717  			continue;
718  		/*
719  		 * We do not discard pages that are backed by
720  		 * hugetlbfs, so we don't have to refault them.
721  		 */
722  		if (is_vm_hugetlb_page(vma))
723  			continue;
724  		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
725  		zap_page_range_single(vma, vmaddr, size, NULL);
726  	}
727  	mmap_read_unlock(gmap->mm);
728  }
729  EXPORT_SYMBOL_GPL(gmap_discard);
730  
731  static LIST_HEAD(gmap_notifier_list);
732  static DEFINE_SPINLOCK(gmap_notifier_lock);
733  
734  /**
735   * gmap_register_pte_notifier - register a pte invalidation callback
736   * @nb: pointer to the gmap notifier block
737   */
738  void gmap_register_pte_notifier(struct gmap_notifier *nb)
739  {
740  	spin_lock(&gmap_notifier_lock);
741  	list_add_rcu(&nb->list, &gmap_notifier_list);
742  	spin_unlock(&gmap_notifier_lock);
743  }
744  EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
745  
746  /**
747   * gmap_unregister_pte_notifier - remove a pte invalidation callback
748   * @nb: pointer to the gmap notifier block
749   */
750  void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
751  {
752  	spin_lock(&gmap_notifier_lock);
753  	list_del_rcu(&nb->list);
754  	spin_unlock(&gmap_notifier_lock);
755  	synchronize_rcu();
756  }
757  EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
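/*
 * Illustrative sketch (editor's addition, not compiled): a pte invalidation
 * callback as a user like KVM would register it.  The callback is invoked by
 * gmap_call_notifier() with the invalidated guest address range.
 */
#if 0
static void example_notifier_call(struct gmap *gmap, unsigned long start,
				  unsigned long end)
{
	/* react to the invalidated guest range [start, end] */
}

static struct gmap_notifier example_notifier = {
	.notifier_call = example_notifier_call,
};

static void example_register(void)
{
	gmap_register_pte_notifier(&example_notifier);
}
#endif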
758  
759  /**
760   * gmap_call_notifier - call all registered invalidation callbacks
761   * @gmap: pointer to guest mapping meta data structure
762   * @start: start virtual address in the guest address space
763   * @end: end virtual address in the guest address space
764   */
765  static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
766  			       unsigned long end)
767  {
768  	struct gmap_notifier *nb;
769  
770  	list_for_each_entry(nb, &gmap_notifier_list, list)
771  		nb->notifier_call(gmap, start, end);
772  }
773  
774  /**
775   * gmap_table_walk - walk the gmap page tables
776   * @gmap: pointer to guest mapping meta data structure
777   * @gaddr: virtual address in the guest address space
778   * @level: page table level to stop at
779   *
780   * Returns a table entry pointer for the given guest address and @level
781   * @level=0 : returns a pointer to a page table entry (or NULL)
782   * @level=1 : returns a pointer to a segment table entry (or NULL)
783   * @level=2 : returns a pointer to a region-3 table entry (or NULL)
784   * @level=3 : returns a pointer to a region-2 table entry (or NULL)
785   * @level=4 : returns a pointer to a region-1 table entry (or NULL)
786   *
787   * Returns NULL if the gmap page tables could not be walked to the
788   * requested level.
789   *
790   * Note: Can also be called for shadow gmaps.
791   */
792  static inline unsigned long *gmap_table_walk(struct gmap *gmap,
793  					     unsigned long gaddr, int level)
794  {
795  	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
796  	unsigned long *table = gmap->table;
797  
798  	if (gmap_is_shadow(gmap) && gmap->removed)
799  		return NULL;
800  
801  	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
802  		return NULL;
803  
804  	if (asce_type != _ASCE_TYPE_REGION1 &&
805  	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
806  		return NULL;
807  
808  	switch (asce_type) {
809  	case _ASCE_TYPE_REGION1:
810  		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
811  		if (level == 4)
812  			break;
813  		if (*table & _REGION_ENTRY_INVALID)
814  			return NULL;
815  		table = __va(*table & _REGION_ENTRY_ORIGIN);
816  		fallthrough;
817  	case _ASCE_TYPE_REGION2:
818  		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
819  		if (level == 3)
820  			break;
821  		if (*table & _REGION_ENTRY_INVALID)
822  			return NULL;
823  		table = __va(*table & _REGION_ENTRY_ORIGIN);
824  		fallthrough;
825  	case _ASCE_TYPE_REGION3:
826  		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
827  		if (level == 2)
828  			break;
829  		if (*table & _REGION_ENTRY_INVALID)
830  			return NULL;
831  		table = __va(*table & _REGION_ENTRY_ORIGIN);
832  		fallthrough;
833  	case _ASCE_TYPE_SEGMENT:
834  		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
835  		if (level == 1)
836  			break;
837  		if (*table & _REGION_ENTRY_INVALID)
838  			return NULL;
839  		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
840  		table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
841  	}
842  	return table;
843  }
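/*
 * Editor's note (illustration): for a 4 TB gmap (region-3 ASCE),
 * gmap_table_walk(gmap, gaddr, 1) returns the segment table entry for gaddr,
 * level 0 descends one step further to the page table entry, and requesting
 * a level above the top table type (here anything above 2) hits the
 * WARN_ON_ONCE() and returns NULL.
 */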
844  
845  /**
846   * gmap_pte_op_walk - walk the gmap page table, get the page table lock
847   *		      and return the pte pointer
848   * @gmap: pointer to guest mapping meta data structure
849   * @gaddr: virtual address in the guest address space
850   * @ptl: pointer to the spinlock pointer
851   *
852   * Returns a pointer to the locked pte for a guest address, or NULL
853   */
854  static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
855  			       spinlock_t **ptl)
856  {
857  	unsigned long *table;
858  
859  	BUG_ON(gmap_is_shadow(gmap));
860  	/* Walk the gmap page table, lock and get pte pointer */
861  	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
862  	if (!table || *table & _SEGMENT_ENTRY_INVALID)
863  		return NULL;
864  	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
865  }
866  
867  /**
868   * gmap_pte_op_fixup - force a page in and connect the gmap page table
869   * @gmap: pointer to guest mapping meta data structure
870   * @gaddr: virtual address in the guest address space
871   * @vmaddr: address in the host process address space
872   * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
873   *
874   * Returns 0 if the caller can retry __gmap_translate (might fail again),
875   * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
876   * up or connecting the gmap page table.
877   */
878  static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
879  			     unsigned long vmaddr, int prot)
880  {
881  	struct mm_struct *mm = gmap->mm;
882  	unsigned int fault_flags;
883  	bool unlocked = false;
884  
885  	BUG_ON(gmap_is_shadow(gmap));
886  	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
887  	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
888  		return -EFAULT;
889  	if (unlocked)
890  		/* lost mmap_lock, caller has to retry __gmap_translate */
891  		return 0;
892  	/* Connect the page tables */
893  	return __gmap_link(gmap, gaddr, vmaddr);
894  }
895  
896  /**
897   * gmap_pte_op_end - release the page table lock
898   * @ptl: pointer to the spinlock pointer
899   */
900  static void gmap_pte_op_end(spinlock_t *ptl)
901  {
902  	if (ptl)
903  		spin_unlock(ptl);
904  }
905  
906  /**
907   * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
908   *		      and return the pmd pointer
909   * @gmap: pointer to guest mapping meta data structure
910   * @gaddr: virtual address in the guest address space
911   *
912   * Returns a pointer to the pmd for a guest address, or NULL
913   */
914  static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
915  {
916  	pmd_t *pmdp;
917  
918  	BUG_ON(gmap_is_shadow(gmap));
919  	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
920  	if (!pmdp)
921  		return NULL;
922  
923  	/* without huge pages, there is no need to take the table lock */
924  	if (!gmap->mm->context.allow_gmap_hpage_1m)
925  		return pmd_none(*pmdp) ? NULL : pmdp;
926  
927  	spin_lock(&gmap->guest_table_lock);
928  	if (pmd_none(*pmdp)) {
929  		spin_unlock(&gmap->guest_table_lock);
930  		return NULL;
931  	}
932  
933  	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
934  	if (!pmd_large(*pmdp))
935  		spin_unlock(&gmap->guest_table_lock);
936  	return pmdp;
937  }
938  
939  /**
940   * gmap_pmd_op_end - release the guest_table_lock if needed
941   * @gmap: pointer to the guest mapping meta data structure
942   * @pmdp: pointer to the pmd
943   */
944  static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
945  {
946  	if (pmd_large(*pmdp))
947  		spin_unlock(&gmap->guest_table_lock);
948  }
949  
950  /*
951   * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
952   * @pmdp: pointer to the pmd to be protected
953   * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
954   * @bits: notification bits to set
955   *
956   * Returns:
957   * 0 if successfully protected
958   * -EAGAIN if a fixup is needed
959   * -EINVAL if unsupported notifier bits have been specified
960   *
961   * Expected to be called with sg->mm->mmap_lock in read and
962   * guest_table_lock held.
963   */
964  static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
965  			    pmd_t *pmdp, int prot, unsigned long bits)
966  {
967  	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
968  	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
969  	pmd_t new = *pmdp;
970  
971  	/* Fixup needed */
972  	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
973  		return -EAGAIN;
974  
975  	if (prot == PROT_NONE && !pmd_i) {
976  		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
977  		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
978  	}
979  
980  	if (prot == PROT_READ && !pmd_p) {
981  		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
982  		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
983  		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
984  	}
985  
986  	if (bits & GMAP_NOTIFY_MPROT)
987  		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
988  
989  	/* Shadow GMAP protection needs split PMDs */
990  	if (bits & GMAP_NOTIFY_SHADOW)
991  		return -EINVAL;
992  
993  	return 0;
994  }
995  
996  /*
997   * gmap_protect_pte - remove access rights to memory and set pgste bits
998   * @gmap: pointer to guest mapping meta data structure
999   * @gaddr: virtual address in the guest address space
1000   * @pmdp: pointer to the pmd associated with the pte
1001   * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1002   * @bits: notification bits to set
1003   *
1004   * Returns 0 if successfully protected, -ENOMEM if out of memory and
1005   * -EAGAIN if a fixup is needed.
1006   *
1007   * Expected to be called with sg->mm->mmap_lock in read
1008   */
1009  static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
1010  			    pmd_t *pmdp, int prot, unsigned long bits)
1011  {
1012  	int rc;
1013  	pte_t *ptep;
1014  	spinlock_t *ptl = NULL;
1015  	unsigned long pbits = 0;
1016  
1017  	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
1018  		return -EAGAIN;
1019  
1020  	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
1021  	if (!ptep)
1022  		return -ENOMEM;
1023  
1024  	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
1025  	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
1026  	/* Protect and unlock. */
1027  	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
1028  	gmap_pte_op_end(ptl);
1029  	return rc;
1030  }
1031  
1032  /*
1033   * gmap_protect_range - remove access rights to memory and set pgste bits
1034   * @gmap: pointer to guest mapping meta data structure
1035   * @gaddr: virtual address in the guest address space
1036   * @len: size of area
1037   * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1038   * @bits: pgste notification bits to set
1039   *
1040   * Returns 0 if successfully protected, -ENOMEM if out of memory and
1041   * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
1042   *
1043   * Called with sg->mm->mmap_lock in read.
1044   */
1045  static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
1046  			      unsigned long len, int prot, unsigned long bits)
1047  {
1048  	unsigned long vmaddr, dist;
1049  	pmd_t *pmdp;
1050  	int rc;
1051  
1052  	BUG_ON(gmap_is_shadow(gmap));
1053  	while (len) {
1054  		rc = -EAGAIN;
1055  		pmdp = gmap_pmd_op_walk(gmap, gaddr);
1056  		if (pmdp) {
1057  			if (!pmd_large(*pmdp)) {
1058  				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
1059  						      bits);
1060  				if (!rc) {
1061  					len -= PAGE_SIZE;
1062  					gaddr += PAGE_SIZE;
1063  				}
1064  			} else {
1065  				rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
1066  						      bits);
1067  				if (!rc) {
1068  					dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
1069  					len = len < dist ? 0 : len - dist;
1070  					gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
1071  				}
1072  			}
1073  			gmap_pmd_op_end(gmap, pmdp);
1074  		}
1075  		if (rc) {
1076  			if (rc == -EINVAL)
1077  				return rc;
1078  
1079  			/* -EAGAIN, fixup of userspace mm and gmap */
1080  			vmaddr = __gmap_translate(gmap, gaddr);
1081  			if (IS_ERR_VALUE(vmaddr))
1082  				return vmaddr;
1083  			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
1084  			if (rc)
1085  				return rc;
1086  		}
1087  	}
1088  	return 0;
1089  }
1090  
1091  /**
1092   * gmap_mprotect_notify - change access rights for a range of ptes and
1093   *                        call the notifier if any pte changes again
1094   * @gmap: pointer to guest mapping meta data structure
1095   * @gaddr: virtual address in the guest address space
1096   * @len: size of area
1097   * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1098   *
1099   * Returns 0 if for each page in the given range a gmap mapping exists,
1100   * the new access rights could be set and the notifier could be armed.
1101   * If the gmap mapping is missing for one or more pages -EFAULT is
1102   * returned. If no memory could be allocated -ENOMEM is returned.
1103   * This function establishes missing page table entries.
1104   */
1105  int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
1106  			 unsigned long len, int prot)
1107  {
1108  	int rc;
1109  
1110  	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
1111  		return -EINVAL;
1112  	if (!MACHINE_HAS_ESOP && prot == PROT_READ)
1113  		return -EINVAL;
1114  	mmap_read_lock(gmap->mm);
1115  	rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
1116  	mmap_read_unlock(gmap->mm);
1117  	return rc;
1118  }
1119  EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
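/*
 * Illustrative sketch (editor's addition, not compiled): write-protecting a
 * page-aligned guest range so that a later change to the mapping triggers
 * the registered pte notifier.
 */
#if 0
static int example_arm_notification(struct gmap *gmap, unsigned long gaddr)
{
	/* gaddr and len must be PAGE_SIZE aligned */
	return gmap_mprotect_notify(gmap, gaddr & PAGE_MASK, PAGE_SIZE,
				    PROT_READ);
}
#endif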
1120  
1121  /**
1122   * gmap_read_table - get an unsigned long value from a guest page table using
1123   *                   absolute addressing, without marking the page referenced.
1124   * @gmap: pointer to guest mapping meta data structure
1125   * @gaddr: virtual address in the guest address space
1126   * @val: pointer to the unsigned long value to return
1127   *
1128   * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
1129   * if reading using the virtual address failed. -EINVAL if called on a gmap
1130   * shadow.
1131   *
1132   * Called with gmap->mm->mmap_lock in read.
1133   */
1134  int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
1135  {
1136  	unsigned long address, vmaddr;
1137  	spinlock_t *ptl;
1138  	pte_t *ptep, pte;
1139  	int rc;
1140  
1141  	if (gmap_is_shadow(gmap))
1142  		return -EINVAL;
1143  
1144  	while (1) {
1145  		rc = -EAGAIN;
1146  		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1147  		if (ptep) {
1148  			pte = *ptep;
1149  			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1150  				address = pte_val(pte) & PAGE_MASK;
1151  				address += gaddr & ~PAGE_MASK;
1152  				*val = *(unsigned long *)__va(address);
1153  				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
1154  				/* Do *NOT* clear the _PAGE_INVALID bit! */
1155  				rc = 0;
1156  			}
1157  			gmap_pte_op_end(ptl);
1158  		}
1159  		if (!rc)
1160  			break;
1161  		vmaddr = __gmap_translate(gmap, gaddr);
1162  		if (IS_ERR_VALUE(vmaddr)) {
1163  			rc = vmaddr;
1164  			break;
1165  		}
1166  		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1167  		if (rc)
1168  			break;
1169  	}
1170  	return rc;
1171  }
1172  EXPORT_SYMBOL_GPL(gmap_read_table);
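/*
 * Illustrative sketch (editor's addition, not compiled): peeking at a
 * guest-owned table entry without marking the page referenced.  The
 * mmap_lock is taken here because gmap_read_table() expects it to be held
 * in read mode.
 */
#if 0
static int example_peek(struct gmap *gmap, unsigned long gaddr,
			unsigned long *val)
{
	int rc;

	mmap_read_lock(gmap->mm);
	rc = gmap_read_table(gmap, gaddr, val);
	mmap_read_unlock(gmap->mm);
	return rc;
}
#endif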
1173  
1174  /**
1175   * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1176   * @sg: pointer to the shadow guest address space structure
1177   * @vmaddr: vm address associated with the rmap
1178   * @rmap: pointer to the rmap structure
1179   *
1180   * Called with the sg->guest_table_lock
1181   */
1182  static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1183  				    struct gmap_rmap *rmap)
1184  {
1185  	struct gmap_rmap *temp;
1186  	void __rcu **slot;
1187  
1188  	BUG_ON(!gmap_is_shadow(sg));
1189  	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1190  	if (slot) {
1191  		rmap->next = radix_tree_deref_slot_protected(slot,
1192  							&sg->guest_table_lock);
1193  		for (temp = rmap->next; temp; temp = temp->next) {
1194  			if (temp->raddr == rmap->raddr) {
1195  				kfree(rmap);
1196  				return;
1197  			}
1198  		}
1199  		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1200  	} else {
1201  		rmap->next = NULL;
1202  		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1203  				  rmap);
1204  	}
1205  }
1206  
1207  /**
1208   * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1209   * @sg: pointer to the shadow guest address space structure
1210   * @raddr: rmap address in the shadow gmap
1211   * @paddr: address in the parent guest address space
1212   * @len: length of the memory area to protect
1213   *
1214   * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1215   * if out of memory and -EFAULT if paddr is invalid.
1216   */
1217  static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1218  			     unsigned long paddr, unsigned long len)
1219  {
1220  	struct gmap *parent;
1221  	struct gmap_rmap *rmap;
1222  	unsigned long vmaddr;
1223  	spinlock_t *ptl;
1224  	pte_t *ptep;
1225  	int rc;
1226  
1227  	BUG_ON(!gmap_is_shadow(sg));
1228  	parent = sg->parent;
1229  	while (len) {
1230  		vmaddr = __gmap_translate(parent, paddr);
1231  		if (IS_ERR_VALUE(vmaddr))
1232  			return vmaddr;
1233  		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1234  		if (!rmap)
1235  			return -ENOMEM;
1236  		rmap->raddr = raddr;
1237  		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1238  		if (rc) {
1239  			kfree(rmap);
1240  			return rc;
1241  		}
1242  		rc = -EAGAIN;
1243  		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1244  		if (ptep) {
1245  			spin_lock(&sg->guest_table_lock);
1246  			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1247  					     PGSTE_VSIE_BIT);
1248  			if (!rc)
1249  				gmap_insert_rmap(sg, vmaddr, rmap);
1250  			spin_unlock(&sg->guest_table_lock);
1251  			gmap_pte_op_end(ptl);
1252  		}
1253  		radix_tree_preload_end();
1254  		if (rc) {
1255  			kfree(rmap);
1256  			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1257  			if (rc)
1258  				return rc;
1259  			continue;
1260  		}
1261  		paddr += PAGE_SIZE;
1262  		len -= PAGE_SIZE;
1263  	}
1264  	return 0;
1265  }
1266  
1267  #define _SHADOW_RMAP_MASK	0x7
1268  #define _SHADOW_RMAP_REGION1	0x5
1269  #define _SHADOW_RMAP_REGION2	0x4
1270  #define _SHADOW_RMAP_REGION3	0x3
1271  #define _SHADOW_RMAP_SEGMENT	0x2
1272  #define _SHADOW_RMAP_PGTABLE	0x1
1273  
1274  /**
1275   * gmap_idte_one - invalidate a single region or segment table entry
1276   * @asce: region or segment table *origin* + table-type bits
1277   * @vaddr: virtual address to identify the table entry to flush
1278   *
1279   * The invalid bit of a single region or segment table entry is set
1280   * and the associated TLB entries depending on the entry are flushed.
1281   * The table-type of the @asce identifies the portion of the @vaddr
1282   * that is used as the invalidation index.
1283   */
1284  static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1285  {
1286  	asm volatile(
1287  		"	idte	%0,0,%1"
1288  		: : "a" (asce), "a" (vaddr) : "cc", "memory");
1289  }
1290  
1291  /**
1292   * gmap_unshadow_page - remove a page from a shadow page table
1293   * @sg: pointer to the shadow guest address space structure
1294   * @raddr: rmap address in the shadow guest address space
1295   *
1296   * Called with the sg->guest_table_lock
1297   */
1298  static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1299  {
1300  	unsigned long *table;
1301  
1302  	BUG_ON(!gmap_is_shadow(sg));
1303  	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1304  	if (!table || *table & _PAGE_INVALID)
1305  		return;
1306  	gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
1307  	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1308  }
1309  
1310  /**
1311   * __gmap_unshadow_pgt - remove all entries from a shadow page table
1312   * @sg: pointer to the shadow guest address space structure
1313   * @raddr: rmap address in the shadow guest address space
1314   * @pgt: pointer to the start of a shadow page table
1315   *
1316   * Called with the sg->guest_table_lock
1317   */
1318  static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1319  				unsigned long *pgt)
1320  {
1321  	int i;
1322  
1323  	BUG_ON(!gmap_is_shadow(sg));
1324  	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
1325  		pgt[i] = _PAGE_INVALID;
1326  }
1327  
1328  /**
1329   * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1330   * @sg: pointer to the shadow guest address space structure
1331   * @raddr: address in the shadow guest address space
1332   *
1333   * Called with the sg->guest_table_lock
1334   */
1335  static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1336  {
1337  	unsigned long *ste;
1338  	phys_addr_t sto, pgt;
1339  	struct page *page;
1340  
1341  	BUG_ON(!gmap_is_shadow(sg));
1342  	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1343  	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1344  		return;
1345  	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1346  	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1347  	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1348  	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
1349  	*ste = _SEGMENT_ENTRY_EMPTY;
1350  	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1351  	/* Free page table */
1352  	page = phys_to_page(pgt);
1353  	list_del(&page->lru);
1354  	page_table_free_pgste(page);
1355  }
1356  
1357  /**
1358   * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1359   * @sg: pointer to the shadow guest address space structure
1360   * @raddr: rmap address in the shadow guest address space
1361   * @sgt: pointer to the start of a shadow segment table
1362   *
1363   * Called with the sg->guest_table_lock
1364   */
1365  static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1366  				unsigned long *sgt)
1367  {
1368  	struct page *page;
1369  	phys_addr_t pgt;
1370  	int i;
1371  
1372  	BUG_ON(!gmap_is_shadow(sg));
1373  	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1374  		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1375  			continue;
1376  		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
1377  		sgt[i] = _SEGMENT_ENTRY_EMPTY;
1378  		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1379  		/* Free page table */
1380  		page = phys_to_page(pgt);
1381  		list_del(&page->lru);
1382  		page_table_free_pgste(page);
1383  	}
1384  }
1385  
1386  /**
1387   * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1388   * @sg: pointer to the shadow guest address space structure
1389   * @raddr: rmap address in the shadow guest address space
1390   *
1391   * Called with the shadow->guest_table_lock
1392   */
1393  static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1394  {
1395  	unsigned long r3o, *r3e;
1396  	phys_addr_t sgt;
1397  	struct page *page;
1398  
1399  	BUG_ON(!gmap_is_shadow(sg));
1400  	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1401  	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1402  		return;
1403  	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1404  	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1405  	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
1406  	sgt = *r3e & _REGION_ENTRY_ORIGIN;
1407  	*r3e = _REGION3_ENTRY_EMPTY;
1408  	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1409  	/* Free segment table */
1410  	page = phys_to_page(sgt);
1411  	list_del(&page->lru);
1412  	__free_pages(page, CRST_ALLOC_ORDER);
1413  }
1414  
1415  /**
1416   * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1417   * @sg: pointer to the shadow guest address space structure
1418   * @raddr: address in the shadow guest address space
1419   * @r3t: pointer to the start of a shadow region-3 table
1420   *
1421   * Called with the sg->guest_table_lock
1422   */
1423  static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1424  				unsigned long *r3t)
1425  {
1426  	struct page *page;
1427  	phys_addr_t sgt;
1428  	int i;
1429  
1430  	BUG_ON(!gmap_is_shadow(sg));
1431  	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1432  		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1433  			continue;
1434  		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
1435  		r3t[i] = _REGION3_ENTRY_EMPTY;
1436  		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1437  		/* Free segment table */
1438  		page = phys_to_page(sgt);
1439  		list_del(&page->lru);
1440  		__free_pages(page, CRST_ALLOC_ORDER);
1441  	}
1442  }
1443  
1444  /**
1445   * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1446   * @sg: pointer to the shadow guest address space structure
1447   * @raddr: rmap address in the shadow guest address space
1448   *
1449   * Called with the sg->guest_table_lock
1450   */
1451  static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1452  {
1453  	unsigned long r2o, *r2e;
1454  	phys_addr_t r3t;
1455  	struct page *page;
1456  
1457  	BUG_ON(!gmap_is_shadow(sg));
1458  	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1459  	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1460  		return;
1461  	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1462  	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1463  	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
1464  	r3t = *r2e & _REGION_ENTRY_ORIGIN;
1465  	*r2e = _REGION2_ENTRY_EMPTY;
1466  	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1467  	/* Free region 3 table */
1468  	page = phys_to_page(r3t);
1469  	list_del(&page->lru);
1470  	__free_pages(page, CRST_ALLOC_ORDER);
1471  }
1472  
1473  /**
1474   * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1475   * @sg: pointer to the shadow guest address space structure
1476   * @raddr: rmap address in the shadow guest address space
1477   * @r2t: pointer to the start of a shadow region-2 table
1478   *
1479   * Called with the sg->guest_table_lock
1480   */
1481  static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1482  				unsigned long *r2t)
1483  {
1484  	phys_addr_t r3t;
1485  	struct page *page;
1486  	int i;
1487  
1488  	BUG_ON(!gmap_is_shadow(sg));
1489  	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1490  		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1491  			continue;
1492  		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
1493  		r2t[i] = _REGION2_ENTRY_EMPTY;
1494  		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1495  		/* Free region 3 table */
1496  		page = phys_to_page(r3t);
1497  		list_del(&page->lru);
1498  		__free_pages(page, CRST_ALLOC_ORDER);
1499  	}
1500  }
1501  
1502  /**
1503   * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1504   * @sg: pointer to the shadow guest address space structure
1505   * @raddr: rmap address in the shadow guest address space
1506   *
1507   * Called with the sg->guest_table_lock
1508   */
1509  static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1510  {
1511  	unsigned long r1o, *r1e;
1512  	struct page *page;
1513  	phys_addr_t r2t;
1514  
1515  	BUG_ON(!gmap_is_shadow(sg));
1516  	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1517  	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1518  		return;
1519  	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1520  	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1521  	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
1522  	r2t = *r1e & _REGION_ENTRY_ORIGIN;
1523  	*r1e = _REGION1_ENTRY_EMPTY;
1524  	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1525  	/* Free region 2 table */
1526  	page = phys_to_page(r2t);
1527  	list_del(&page->lru);
1528  	__free_pages(page, CRST_ALLOC_ORDER);
1529  }
1530  
1531  /**
1532   * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1533   * @sg: pointer to the shadow guest address space structure
1534   * @raddr: rmap address in the shadow guest address space
1535   * @r1t: pointer to the start of a shadow region-1 table
1536   *
1537   * Called with the shadow->guest_table_lock
1538   */
1539  static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1540  				unsigned long *r1t)
1541  {
1542  	unsigned long asce;
1543  	struct page *page;
1544  	phys_addr_t r2t;
1545  	int i;
1546  
1547  	BUG_ON(!gmap_is_shadow(sg));
1548  	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
1549  	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1550  		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1551  			continue;
1552  		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
1553  		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1554  		/* Clear entry and flush translation r1t -> r2t */
1555  		gmap_idte_one(asce, raddr);
1556  		r1t[i] = _REGION1_ENTRY_EMPTY;
1557  		/* Free region 2 table */
1558  		page = phys_to_page(r2t);
1559  		list_del(&page->lru);
1560  		__free_pages(page, CRST_ALLOC_ORDER);
1561  	}
1562  }
1563  
1564  /**
1565   * gmap_unshadow - remove a shadow page table completely
1566   * @sg: pointer to the shadow guest address space structure
1567   *
1568   * Called with sg->guest_table_lock
1569   */
1570  static void gmap_unshadow(struct gmap *sg)
1571  {
1572  	unsigned long *table;
1573  
1574  	BUG_ON(!gmap_is_shadow(sg));
1575  	if (sg->removed)
1576  		return;
1577  	sg->removed = 1;
1578  	gmap_call_notifier(sg, 0, -1UL);
1579  	gmap_flush_tlb(sg);
1580  	table = __va(sg->asce & _ASCE_ORIGIN);
1581  	switch (sg->asce & _ASCE_TYPE_MASK) {
1582  	case _ASCE_TYPE_REGION1:
1583  		__gmap_unshadow_r1t(sg, 0, table);
1584  		break;
1585  	case _ASCE_TYPE_REGION2:
1586  		__gmap_unshadow_r2t(sg, 0, table);
1587  		break;
1588  	case _ASCE_TYPE_REGION3:
1589  		__gmap_unshadow_r3t(sg, 0, table);
1590  		break;
1591  	case _ASCE_TYPE_SEGMENT:
1592  		__gmap_unshadow_sgt(sg, 0, table);
1593  		break;
1594  	}
1595  }
1596  
1597  /**
1598   * gmap_find_shadow - find a specific asce in the list of shadow tables
1599   * @parent: pointer to the parent gmap
1600   * @asce: ASCE for which the shadow table is created
1601   * @edat_level: edat level to be used for the shadow translation
1602   *
1603   * Returns the pointer to a gmap if a shadow table with the given asce is
1604   * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1605   * otherwise NULL
1606   */
1607  static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
1608  				     int edat_level)
1609  {
1610  	struct gmap *sg;
1611  
1612  	list_for_each_entry(sg, &parent->children, list) {
1613  		if (sg->orig_asce != asce || sg->edat_level != edat_level ||
1614  		    sg->removed)
1615  			continue;
1616  		if (!sg->initialized)
1617  			return ERR_PTR(-EAGAIN);
1618  		refcount_inc(&sg->ref_count);
1619  		return sg;
1620  	}
1621  	return NULL;
1622  }
1623  
1624  /**
1625   * gmap_shadow_valid - check if a shadow guest address space matches the
1626   *                     given properties and is still valid
1627   * @sg: pointer to the shadow guest address space structure
1628   * @asce: ASCE for which the shadow table is requested
1629   * @edat_level: edat level to be used for the shadow translation
1630   *
1631   * Returns 1 if the gmap shadow is still valid and matches the given
1632   * properties, the caller can continue using it. Returns 0 otherwise, the
1633   * caller has to request a new shadow gmap in this case.
1635   */
1636  int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
1637  {
1638  	if (sg->removed)
1639  		return 0;
1640  	return sg->orig_asce == asce && sg->edat_level == edat_level;
1641  }
1642  EXPORT_SYMBOL_GPL(gmap_shadow_valid);
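
/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * caller that caches a shadow gmap could revalidate it before each use and
 * only request a new one when the cached shadow no longer matches:
 *
 *	if (cached_sg && gmap_shadow_valid(cached_sg, asce, edat_level))
 *		return cached_sg;
 *	if (cached_sg)
 *		gmap_put(cached_sg);
 *	cached_sg = gmap_shadow(parent, asce, edat_level);
 *
 * cached_sg, parent, asce and edat_level are placeholder names used only
 * for this example.
 */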
1643  
1644  /**
1645   * gmap_shadow - create/find a shadow guest address space
1646   * @parent: pointer to the parent gmap
1647   * @asce: ASCE for which the shadow table is created
1648   * @edat_level: edat level to be used for the shadow translation
1649   *
1650   * The pages of the top level page table referred to by the asce parameter
1651   * will be set to read-only and marked in the PGSTEs of the kvm process.
1652   * The shadow table will be removed automatically on any change to the
1653   * PTE mapping for the source table.
1654   *
1655   * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1656   * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1657   * parent gmap table could not be protected.
1658   */
1659  struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
1660  			 int edat_level)
1661  {
1662  	struct gmap *sg, *new;
1663  	unsigned long limit;
1664  	int rc;
1665  
1666  	BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
1667  	BUG_ON(gmap_is_shadow(parent));
1668  	spin_lock(&parent->shadow_lock);
1669  	sg = gmap_find_shadow(parent, asce, edat_level);
1670  	spin_unlock(&parent->shadow_lock);
1671  	if (sg)
1672  		return sg;
1673  	/* Create a new shadow gmap */
1674  	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
1675  	if (asce & _ASCE_REAL_SPACE)
1676  		limit = -1UL;
1677  	new = gmap_alloc(limit);
1678  	if (!new)
1679  		return ERR_PTR(-ENOMEM);
1680  	new->mm = parent->mm;
1681  	new->parent = gmap_get(parent);
1682  	new->orig_asce = asce;
1683  	new->edat_level = edat_level;
1684  	new->initialized = false;
1685  	spin_lock(&parent->shadow_lock);
1686  	/* Recheck if another CPU created the same shadow */
1687  	sg = gmap_find_shadow(parent, asce, edat_level);
1688  	if (sg) {
1689  		spin_unlock(&parent->shadow_lock);
1690  		gmap_free(new);
1691  		return sg;
1692  	}
1693  	if (asce & _ASCE_REAL_SPACE) {
1694  		/* only allow one real-space gmap shadow */
1695  		list_for_each_entry(sg, &parent->children, list) {
1696  			if (sg->orig_asce & _ASCE_REAL_SPACE) {
1697  				spin_lock(&sg->guest_table_lock);
1698  				gmap_unshadow(sg);
1699  				spin_unlock(&sg->guest_table_lock);
1700  				list_del(&sg->list);
1701  				gmap_put(sg);
1702  				break;
1703  			}
1704  		}
1705  	}
1706  	refcount_set(&new->ref_count, 2);
1707  	list_add(&new->list, &parent->children);
1708  	if (asce & _ASCE_REAL_SPACE) {
1709  		/* nothing to protect, return right away */
1710  		new->initialized = true;
1711  		spin_unlock(&parent->shadow_lock);
1712  		return new;
1713  	}
1714  	spin_unlock(&parent->shadow_lock);
1715  	/* protect after insertion, so it will get properly invalidated */
1716  	mmap_read_lock(parent->mm);
1717  	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
1718  				((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
1719  				PROT_READ, GMAP_NOTIFY_SHADOW);
1720  	mmap_read_unlock(parent->mm);
1721  	spin_lock(&parent->shadow_lock);
1722  	new->initialized = true;
1723  	if (rc) {
1724  		list_del(&new->list);
1725  		gmap_free(new);
1726  		new = ERR_PTR(rc);
1727  	}
1728  	spin_unlock(&parent->shadow_lock);
1729  	return new;
1730  }
1731  EXPORT_SYMBOL_GPL(gmap_shadow);
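
/*
 * Illustrative sketch, not part of the original source: a hypothetical user
 * of gmap_shadow() would map the returned error codes onto its own retry
 * logic, e.g.:
 *
 *	sg = gmap_shadow(parent, asce, edat_level);
 *	if (IS_ERR(sg)) {
 *		rc = PTR_ERR(sg);
 *		if (rc == -EAGAIN)
 *			goto retry;
 *		return rc;
 *	}
 *
 * -EAGAIN means the caller raced with a concurrent creation or protection
 * change and should simply retry; -ENOMEM and -EFAULT are final. parent,
 * asce, edat_level and the retry label are placeholders for this example.
 */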
1732  
1733  /**
1734   * gmap_shadow_r2t - create an empty shadow region 2 table
1735   * @sg: pointer to the shadow guest address space structure
1736   * @saddr: faulting address in the shadow gmap
1737   * @r2t: parent gmap address of the region 2 table to get shadowed
1738   * @fake: r2t references contiguous guest memory block, not a r2t
1739   *
1740   * The r2t parameter specifies the address of the source table. The
1741   * four pages of the source table are made read-only in the parent gmap
1742   * address space. A write to the source table area @r2t will automatically
1743   * remove the shadow r2 table and all of its descendants.
1744   *
1745   * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1746   * shadow table structure is incomplete, -ENOMEM if out of memory and
1747   * -EFAULT if an address in the parent gmap could not be resolved.
1748   *
1749   * Called with sg->mm->mmap_lock in read.
1750   */
1751  int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1752  		    int fake)
1753  {
1754  	unsigned long raddr, origin, offset, len;
1755  	unsigned long *table;
1756  	phys_addr_t s_r2t;
1757  	struct page *page;
1758  	int rc;
1759  
1760  	BUG_ON(!gmap_is_shadow(sg));
1761  	/* Allocate a shadow region second table */
1762  	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1763  	if (!page)
1764  		return -ENOMEM;
1765  	page->index = r2t & _REGION_ENTRY_ORIGIN;
1766  	if (fake)
1767  		page->index |= GMAP_SHADOW_FAKE_TABLE;
1768  	s_r2t = page_to_phys(page);
1769  	/* Install shadow region second table */
1770  	spin_lock(&sg->guest_table_lock);
1771  	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1772  	if (!table) {
1773  		rc = -EAGAIN;		/* Race with unshadow */
1774  		goto out_free;
1775  	}
1776  	if (!(*table & _REGION_ENTRY_INVALID)) {
1777  		rc = 0;			/* Already established */
1778  		goto out_free;
1779  	} else if (*table & _REGION_ENTRY_ORIGIN) {
1780  		rc = -EAGAIN;		/* Race with shadow */
1781  		goto out_free;
1782  	}
1783  	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
1784  	/* mark as invalid as long as the parent table is not protected */
1785  	*table = s_r2t | _REGION_ENTRY_LENGTH |
1786  		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1787  	if (sg->edat_level >= 1)
1788  		*table |= (r2t & _REGION_ENTRY_PROTECT);
1789  	list_add(&page->lru, &sg->crst_list);
1790  	if (fake) {
1791  		/* nothing to protect for fake tables */
1792  		*table &= ~_REGION_ENTRY_INVALID;
1793  		spin_unlock(&sg->guest_table_lock);
1794  		return 0;
1795  	}
1796  	spin_unlock(&sg->guest_table_lock);
1797  	/* Make r2t read-only in parent gmap page table */
1798  	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1799  	origin = r2t & _REGION_ENTRY_ORIGIN;
1800  	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1801  	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1802  	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1803  	spin_lock(&sg->guest_table_lock);
1804  	if (!rc) {
1805  		table = gmap_table_walk(sg, saddr, 4);
1806  		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
1807  			rc = -EAGAIN;		/* Race with unshadow */
1808  		else
1809  			*table &= ~_REGION_ENTRY_INVALID;
1810  	} else {
1811  		gmap_unshadow_r2t(sg, raddr);
1812  	}
1813  	spin_unlock(&sg->guest_table_lock);
1814  	return rc;
1815  out_free:
1816  	spin_unlock(&sg->guest_table_lock);
1817  	__free_pages(page, CRST_ALLOC_ORDER);
1818  	return rc;
1819  }
1820  EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
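
/*
 * Worked example (illustrative only): the parent region-table entry carries
 * a 2-bit table offset (TF, selected by _REGION_ENTRY_OFFSET) and a 2-bit
 * table length (TL, selected by _REGION_ENTRY_LENGTH). The protected window
 * of the parent table computed above is
 *
 *	offset = TF * PAGE_SIZE;
 *	len    = (TL + 1) * PAGE_SIZE - offset;
 *
 * so a complete region table (TF = 0, TL = 3) protects all four pages,
 * while e.g. TF = 1, TL = 2 protects only the second and third page.
 */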
1821  
1822  /**
1823   * gmap_shadow_r3t - create a shadow region 3 table
1824   * @sg: pointer to the shadow guest address space structure
1825   * @saddr: faulting address in the shadow gmap
1826   * @r3t: parent gmap address of the region 3 table to get shadowed
1827   * @fake: r3t references contiguous guest memory block, not a r3t
1828   *
1829   * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1830   * shadow table structure is incomplete, -ENOMEM if out of memory and
1831   * -EFAULT if an address in the parent gmap could not be resolved.
1832   *
1833   * Called with sg->mm->mmap_lock in read.
1834   */
1835  int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1836  		    int fake)
1837  {
1838  	unsigned long raddr, origin, offset, len;
1839  	unsigned long *table;
1840  	phys_addr_t s_r3t;
1841  	struct page *page;
1842  	int rc;
1843  
1844  	BUG_ON(!gmap_is_shadow(sg));
1845  	/* Allocate a shadow region third table */
1846  	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1847  	if (!page)
1848  		return -ENOMEM;
1849  	page->index = r3t & _REGION_ENTRY_ORIGIN;
1850  	if (fake)
1851  		page->index |= GMAP_SHADOW_FAKE_TABLE;
1852  	s_r3t = page_to_phys(page);
1853  	/* Install shadow region third table */
1854  	spin_lock(&sg->guest_table_lock);
1855  	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1856  	if (!table) {
1857  		rc = -EAGAIN;		/* Race with unshadow */
1858  		goto out_free;
1859  	}
1860  	if (!(*table & _REGION_ENTRY_INVALID)) {
1861  		rc = 0;			/* Already established */
1862  		goto out_free;
1863  	} else if (*table & _REGION_ENTRY_ORIGIN) {
1864  		rc = -EAGAIN;		/* Race with shadow */
1865  		goto out_free;
1866  	}
1867  	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
1868  	/* mark as invalid as long as the parent table is not protected */
1869  	*table = s_r3t | _REGION_ENTRY_LENGTH |
1870  		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1871  	if (sg->edat_level >= 1)
1872  		*table |= (r3t & _REGION_ENTRY_PROTECT);
1873  	list_add(&page->lru, &sg->crst_list);
1874  	if (fake) {
1875  		/* nothing to protect for fake tables */
1876  		*table &= ~_REGION_ENTRY_INVALID;
1877  		spin_unlock(&sg->guest_table_lock);
1878  		return 0;
1879  	}
1880  	spin_unlock(&sg->guest_table_lock);
1881  	/* Make r3t read-only in parent gmap page table */
1882  	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1883  	origin = r3t & _REGION_ENTRY_ORIGIN;
1884  	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1885  	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1886  	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1887  	spin_lock(&sg->guest_table_lock);
1888  	if (!rc) {
1889  		table = gmap_table_walk(sg, saddr, 3);
1890  		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
1891  			rc = -EAGAIN;		/* Race with unshadow */
1892  		else
1893  			*table &= ~_REGION_ENTRY_INVALID;
1894  	} else {
1895  		gmap_unshadow_r3t(sg, raddr);
1896  	}
1897  	spin_unlock(&sg->guest_table_lock);
1898  	return rc;
1899  out_free:
1900  	spin_unlock(&sg->guest_table_lock);
1901  	__free_pages(page, CRST_ALLOC_ORDER);
1902  	return rc;
1903  }
1904  EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1905  
1906  /**
1907   * gmap_shadow_sgt - create a shadow segment table
1908   * @sg: pointer to the shadow guest address space structure
1909   * @saddr: faulting address in the shadow gmap
1910   * @sgt: parent gmap address of the segment table to get shadowed
1911   * @fake: sgt references contiguous guest memory block, not a sgt
1912   *
1913   * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1914   * shadow table structure is incomplete, -ENOMEM if out of memory and
1915   * -EFAULT if an address in the parent gmap could not be resolved.
1916   *
1917   * Called with sg->mm->mmap_lock in read.
1918   */
1919  int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1920  		    int fake)
1921  {
1922  	unsigned long raddr, origin, offset, len;
1923  	unsigned long *table;
1924  	phys_addr_t s_sgt;
1925  	struct page *page;
1926  	int rc;
1927  
1928  	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1929  	/* Allocate a shadow segment table */
1930  	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1931  	if (!page)
1932  		return -ENOMEM;
1933  	page->index = sgt & _REGION_ENTRY_ORIGIN;
1934  	if (fake)
1935  		page->index |= GMAP_SHADOW_FAKE_TABLE;
1936  	s_sgt = page_to_phys(page);
1937  	/* Install shadow segment table */
1938  	spin_lock(&sg->guest_table_lock);
1939  	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1940  	if (!table) {
1941  		rc = -EAGAIN;		/* Race with unshadow */
1942  		goto out_free;
1943  	}
1944  	if (!(*table & _REGION_ENTRY_INVALID)) {
1945  		rc = 0;			/* Already established */
1946  		goto out_free;
1947  	} else if (*table & _REGION_ENTRY_ORIGIN) {
1948  		rc = -EAGAIN;		/* Race with shadow */
1949  		goto out_free;
1950  	}
1951  	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
1952  	/* mark as invalid as long as the parent table is not protected */
1953  	*table = s_sgt | _REGION_ENTRY_LENGTH |
1954  		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1955  	if (sg->edat_level >= 1)
1956  		*table |= sgt & _REGION_ENTRY_PROTECT;
1957  	list_add(&page->lru, &sg->crst_list);
1958  	if (fake) {
1959  		/* nothing to protect for fake tables */
1960  		*table &= ~_REGION_ENTRY_INVALID;
1961  		spin_unlock(&sg->guest_table_lock);
1962  		return 0;
1963  	}
1964  	spin_unlock(&sg->guest_table_lock);
1965  	/* Make sgt read-only in parent gmap page table */
1966  	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1967  	origin = sgt & _REGION_ENTRY_ORIGIN;
1968  	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1969  	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1970  	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1971  	spin_lock(&sg->guest_table_lock);
1972  	if (!rc) {
1973  		table = gmap_table_walk(sg, saddr, 2);
1974  		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
1975  			rc = -EAGAIN;		/* Race with unshadow */
1976  		else
1977  			*table &= ~_REGION_ENTRY_INVALID;
1978  	} else {
1979  		gmap_unshadow_sgt(sg, raddr);
1980  	}
1981  	spin_unlock(&sg->guest_table_lock);
1982  	return rc;
1983  out_free:
1984  	spin_unlock(&sg->guest_table_lock);
1985  	__free_pages(page, CRST_ALLOC_ORDER);
1986  	return rc;
1987  }
1988  EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1989  
1990  /**
1991   * gmap_shadow_pgt_lookup - find a shadow page table
1992   * @sg: pointer to the shadow guest address space structure
1993   * @saddr: the address in the shadow guest address space
1994   * @pgt: parent gmap address of the page table to get shadowed
1995   * @dat_protection: if the pgtable is marked as protected by dat
1996   * @fake: pgt references contiguous guest memory block, not a pgtable
1997   *
1998   * Returns 0 if the shadow page table was found and -EAGAIN if the page
1999   * table was not found.
2000   *
2001   * Called with sg->mm->mmap_lock in read.
2002   */
2003  int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
2004  			   unsigned long *pgt, int *dat_protection,
2005  			   int *fake)
2006  {
2007  	unsigned long *table;
2008  	struct page *page;
2009  	int rc;
2010  
2011  	BUG_ON(!gmap_is_shadow(sg));
2012  	spin_lock(&sg->guest_table_lock);
2013  	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
2014  	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
2015  		/* Shadow page tables are full pages (pte+pgste) */
2016  		page = pfn_to_page(*table >> PAGE_SHIFT);
2017  		*pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
2018  		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
2019  		*fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
2020  		rc = 0;
2021  	} else  {
2022  		rc = -EAGAIN;
2023  	}
2024  	spin_unlock(&sg->guest_table_lock);
2025  	return rc;
2026  
2027  }
2028  EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
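
/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * shadow fault handler would first look for an existing shadow page table
 * and only build the missing levels when the lookup fails:
 *
 *	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
 *	if (rc)
 *		rc = shadow_missing_tables(sg, saddr);
 *	if (!rc)
 *		rc = gmap_shadow_page(sg, saddr, pte);
 *
 * shadow_missing_tables() stands for walking the guest tables and calling
 * gmap_shadow_r2t()/r3t()/sgt()/pgt() for each level that is not shadowed
 * yet; sg, saddr and pte are placeholders for this example.
 */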
2029  
2030  /**
2031   * gmap_shadow_pgt - instantiate a shadow page table
2032   * @sg: pointer to the shadow guest address space structure
2033   * @saddr: faulting address in the shadow gmap
2034   * @pgt: parent gmap address of the page table to get shadowed
2035   * @fake: pgt references contiguous guest memory block, not a pgtable
2036   *
2037   * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
2038   * shadow table structure is incomplete, -ENOMEM if out of memory and
2039   * -EFAULT if an address in the parent gmap could not be resolved.
2040   *
2041   * Called with sg->mm->mmap_lock in read.
2042   */
2043  int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
2044  		    int fake)
2045  {
2046  	unsigned long raddr, origin;
2047  	unsigned long *table;
2048  	struct page *page;
2049  	phys_addr_t s_pgt;
2050  	int rc;
2051  
2052  	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
2053  	/* Allocate a shadow page table */
2054  	page = page_table_alloc_pgste(sg->mm);
2055  	if (!page)
2056  		return -ENOMEM;
2057  	page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
2058  	if (fake)
2059  		page->index |= GMAP_SHADOW_FAKE_TABLE;
2060  	s_pgt = page_to_phys(page);
2061  	/* Install shadow page table */
2062  	spin_lock(&sg->guest_table_lock);
2063  	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
2064  	if (!table) {
2065  		rc = -EAGAIN;		/* Race with unshadow */
2066  		goto out_free;
2067  	}
2068  	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
2069  		rc = 0;			/* Already established */
2070  		goto out_free;
2071  	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
2072  		rc = -EAGAIN;		/* Race with shadow */
2073  		goto out_free;
2074  	}
2075  	/* mark as invalid as long as the parent table is not protected */
2076  	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
2077  		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
2078  	list_add(&page->lru, &sg->pt_list);
2079  	if (fake) {
2080  		/* nothing to protect for fake tables */
2081  		*table &= ~_SEGMENT_ENTRY_INVALID;
2082  		spin_unlock(&sg->guest_table_lock);
2083  		return 0;
2084  	}
2085  	spin_unlock(&sg->guest_table_lock);
2086  	/* Make pgt read-only in parent gmap page table (not the pgste) */
2087  	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
2088  	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
2089  	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
2090  	spin_lock(&sg->guest_table_lock);
2091  	if (!rc) {
2092  		table = gmap_table_walk(sg, saddr, 1);
2093  		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
2094  			rc = -EAGAIN;		/* Race with unshadow */
2095  		else
2096  			*table &= ~_SEGMENT_ENTRY_INVALID;
2097  	} else {
2098  		gmap_unshadow_pgt(sg, raddr);
2099  	}
2100  	spin_unlock(&sg->guest_table_lock);
2101  	return rc;
2102  out_free:
2103  	spin_unlock(&sg->guest_table_lock);
2104  	page_table_free_pgste(page);
2105  	return rc;
2106  
2107  }
2108  EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
2109  
2110  /**
2111   * gmap_shadow_page - create a shadow page mapping
2112   * @sg: pointer to the shadow guest address space structure
2113   * @saddr: faulting address in the shadow gmap
2114   * @pte: pte in parent gmap address space to get shadowed
2115   *
2116   * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
2117   * shadow table structure is incomplete, -ENOMEM if out of memory and
2118   * -EFAULT if an address in the parent gmap could not be resolved.
2119   *
2120   * Called with sg->mm->mmap_lock in read.
2121   */
2122  int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
2123  {
2124  	struct gmap *parent;
2125  	struct gmap_rmap *rmap;
2126  	unsigned long vmaddr, paddr;
2127  	spinlock_t *ptl;
2128  	pte_t *sptep, *tptep;
2129  	int prot;
2130  	int rc;
2131  
2132  	BUG_ON(!gmap_is_shadow(sg));
2133  	parent = sg->parent;
2134  	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
2135  
2136  	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
2137  	if (!rmap)
2138  		return -ENOMEM;
2139  	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
2140  
2141  	while (1) {
2142  		paddr = pte_val(pte) & PAGE_MASK;
2143  		vmaddr = __gmap_translate(parent, paddr);
2144  		if (IS_ERR_VALUE(vmaddr)) {
2145  			rc = vmaddr;
2146  			break;
2147  		}
2148  		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
2149  		if (rc)
2150  			break;
2151  		rc = -EAGAIN;
2152  		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
2153  		if (sptep) {
2154  			spin_lock(&sg->guest_table_lock);
2155  			/* Get page table pointer */
2156  			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
2157  			if (!tptep) {
2158  				spin_unlock(&sg->guest_table_lock);
2159  				gmap_pte_op_end(ptl);
2160  				radix_tree_preload_end();
2161  				break;
2162  			}
2163  			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
2164  			if (rc > 0) {
2165  				/* Success and a new mapping */
2166  				gmap_insert_rmap(sg, vmaddr, rmap);
2167  				rmap = NULL;
2168  				rc = 0;
2169  			}
2170  			gmap_pte_op_end(ptl);
2171  			spin_unlock(&sg->guest_table_lock);
2172  		}
2173  		radix_tree_preload_end();
2174  		if (!rc)
2175  			break;
2176  		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
2177  		if (rc)
2178  			break;
2179  	}
2180  	kfree(rmap);
2181  	return rc;
2182  }
2183  EXPORT_SYMBOL_GPL(gmap_shadow_page);
2184  
2185  /*
2186   * gmap_shadow_notify - handle notifications for shadow gmap
2187   *
2188   * Called with sg->parent->shadow_lock.
2189   */
2190  static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
2191  			       unsigned long gaddr)
2192  {
2193  	struct gmap_rmap *rmap, *rnext, *head;
2194  	unsigned long start, end, bits, raddr;
2195  
2196  	BUG_ON(!gmap_is_shadow(sg));
2197  
2198  	spin_lock(&sg->guest_table_lock);
2199  	if (sg->removed) {
2200  		spin_unlock(&sg->guest_table_lock);
2201  		return;
2202  	}
2203  	/* Check for top level table */
2204  	start = sg->orig_asce & _ASCE_ORIGIN;
2205  	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
2206  	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
2207  	    gaddr < end) {
2208  		/* The complete shadow table has to go */
2209  		gmap_unshadow(sg);
2210  		spin_unlock(&sg->guest_table_lock);
2211  		list_del(&sg->list);
2212  		gmap_put(sg);
2213  		return;
2214  	}
2215  	/* Remove the page table tree from one specific entry */
2216  	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
2217  	gmap_for_each_rmap_safe(rmap, rnext, head) {
2218  		bits = rmap->raddr & _SHADOW_RMAP_MASK;
2219  		raddr = rmap->raddr ^ bits;
2220  		switch (bits) {
2221  		case _SHADOW_RMAP_REGION1:
2222  			gmap_unshadow_r2t(sg, raddr);
2223  			break;
2224  		case _SHADOW_RMAP_REGION2:
2225  			gmap_unshadow_r3t(sg, raddr);
2226  			break;
2227  		case _SHADOW_RMAP_REGION3:
2228  			gmap_unshadow_sgt(sg, raddr);
2229  			break;
2230  		case _SHADOW_RMAP_SEGMENT:
2231  			gmap_unshadow_pgt(sg, raddr);
2232  			break;
2233  		case _SHADOW_RMAP_PGTABLE:
2234  			gmap_unshadow_page(sg, raddr);
2235  			break;
2236  		}
2237  		kfree(rmap);
2238  	}
2239  	spin_unlock(&sg->guest_table_lock);
2240  }
2241  
2242  /**
2243   * ptep_notify - call all invalidation callbacks for a specific pte.
2244   * @mm: pointer to the process mm_struct
2245   * @vmaddr: virtual address in the process address space
2246   * @pte: pointer to the page table entry
2247   * @bits: bits from the pgste that caused the notify call
2248   *
2249   * This function is assumed to be called with the page table lock held
2250   * for the pte to notify.
2251   */
2252  void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
2253  		 pte_t *pte, unsigned long bits)
2254  {
2255  	unsigned long offset, gaddr = 0;
2256  	unsigned long *table;
2257  	struct gmap *gmap, *sg, *next;
2258  
2259  	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
2260  	offset = offset * (PAGE_SIZE / sizeof(pte_t));
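	/*
	 * pte points into a 2K page table with 256 entries. The first line
	 * extracts the byte offset of the entry within that table, the
	 * second scales it to the guest address offset within the segment
	 * (entry index * PAGE_SIZE): e.g. entry index 5 gives 5 * 8 = 40,
	 * and 40 * 512 = 5 * PAGE_SIZE.
	 */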
2261  	rcu_read_lock();
2262  	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2263  		spin_lock(&gmap->guest_table_lock);
2264  		table = radix_tree_lookup(&gmap->host_to_guest,
2265  					  vmaddr >> PMD_SHIFT);
2266  		if (table)
2267  			gaddr = __gmap_segment_gaddr(table) + offset;
2268  		spin_unlock(&gmap->guest_table_lock);
2269  		if (!table)
2270  			continue;
2271  
2272  		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
2273  			spin_lock(&gmap->shadow_lock);
2274  			list_for_each_entry_safe(sg, next,
2275  						 &gmap->children, list)
2276  				gmap_shadow_notify(sg, vmaddr, gaddr);
2277  			spin_unlock(&gmap->shadow_lock);
2278  		}
2279  		if (bits & PGSTE_IN_BIT)
2280  			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
2281  	}
2282  	rcu_read_unlock();
2283  }
2284  EXPORT_SYMBOL_GPL(ptep_notify);
2285  
2286  static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
2287  			     unsigned long gaddr)
2288  {
2289  	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
2290  	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
2291  }
2292  
2293  /**
2294   * gmap_pmdp_xchg - exchange a gmap pmd with another
2295   * @gmap: pointer to the guest address space structure
2296   * @pmdp: pointer to the pmd entry
2297   * @new: replacement entry
2298   * @gaddr: the affected guest address
2299   *
2300   * This function is assumed to be called with the guest_table_lock
2301   * held.
2302   */
2303  static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
2304  			   unsigned long gaddr)
2305  {
2306  	gaddr &= HPAGE_MASK;
2307  	pmdp_notify_gmap(gmap, pmdp, gaddr);
2308  	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
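	/*
	 * Use the most specific invalidation available: IDTE scoped to the
	 * guest ASCE if the machine can tag TLB entries with it, plain IDTE
	 * otherwise, and CSP as the fallback when IDTE is not available.
	 */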
2309  	if (MACHINE_HAS_TLB_GUEST)
2310  		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
2311  			    IDTE_GLOBAL);
2312  	else if (MACHINE_HAS_IDTE)
2313  		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
2314  	else
2315  		__pmdp_csp(pmdp);
2316  	set_pmd(pmdp, new);
2317  }
2318  
2319  static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
2320  			    int purge)
2321  {
2322  	pmd_t *pmdp;
2323  	struct gmap *gmap;
2324  	unsigned long gaddr;
2325  
2326  	rcu_read_lock();
2327  	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2328  		spin_lock(&gmap->guest_table_lock);
2329  		pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
2330  						  vmaddr >> PMD_SHIFT);
2331  		if (pmdp) {
2332  			gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
2333  			pmdp_notify_gmap(gmap, pmdp, gaddr);
2334  			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2335  						   _SEGMENT_ENTRY_GMAP_UC));
2336  			if (purge)
2337  				__pmdp_csp(pmdp);
2338  			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
2339  		}
2340  		spin_unlock(&gmap->guest_table_lock);
2341  	}
2342  	rcu_read_unlock();
2343  }
2344  
2345  /**
2346   * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2347   *                        flushing
2348   * @mm: pointer to the process mm_struct
2349   * @vmaddr: virtual address in the process address space
2350   */
2351  void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2352  {
2353  	gmap_pmdp_clear(mm, vmaddr, 0);
2354  }
2355  EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2356  
2357  /**
2358   * gmap_pmdp_csp - csp all affected guest pmd entries
2359   * @mm: pointer to the process mm_struct
2360   * @vmaddr: virtual address in the process address space
2361   */
2362  void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
2363  {
2364  	gmap_pmdp_clear(mm, vmaddr, 1);
2365  }
2366  EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
2367  
2368  /**
2369   * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2370   * @mm: pointer to the process mm_struct
2371   * @vmaddr: virtual address in the process address space
2372   */
2373  void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2374  {
2375  	unsigned long *entry, gaddr;
2376  	struct gmap *gmap;
2377  	pmd_t *pmdp;
2378  
2379  	rcu_read_lock();
2380  	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2381  		spin_lock(&gmap->guest_table_lock);
2382  		entry = radix_tree_delete(&gmap->host_to_guest,
2383  					  vmaddr >> PMD_SHIFT);
2384  		if (entry) {
2385  			pmdp = (pmd_t *)entry;
2386  			gaddr = __gmap_segment_gaddr(entry);
2387  			pmdp_notify_gmap(gmap, pmdp, gaddr);
2388  			WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2389  					   _SEGMENT_ENTRY_GMAP_UC));
2390  			if (MACHINE_HAS_TLB_GUEST)
2391  				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2392  					    gmap->asce, IDTE_LOCAL);
2393  			else if (MACHINE_HAS_IDTE)
2394  				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2395  			*entry = _SEGMENT_ENTRY_EMPTY;
2396  		}
2397  		spin_unlock(&gmap->guest_table_lock);
2398  	}
2399  	rcu_read_unlock();
2400  }
2401  EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2402  
2403  /**
2404   * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2405   * @mm: pointer to the process mm_struct
2406   * @vmaddr: virtual address in the process address space
2407   */
2408  void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2409  {
2410  	unsigned long *entry, gaddr;
2411  	struct gmap *gmap;
2412  	pmd_t *pmdp;
2413  
2414  	rcu_read_lock();
2415  	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2416  		spin_lock(&gmap->guest_table_lock);
2417  		entry = radix_tree_delete(&gmap->host_to_guest,
2418  					  vmaddr >> PMD_SHIFT);
2419  		if (entry) {
2420  			pmdp = (pmd_t *)entry;
2421  			gaddr = __gmap_segment_gaddr(entry);
2422  			pmdp_notify_gmap(gmap, pmdp, gaddr);
2423  			WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2424  					   _SEGMENT_ENTRY_GMAP_UC));
2425  			if (MACHINE_HAS_TLB_GUEST)
2426  				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2427  					    gmap->asce, IDTE_GLOBAL);
2428  			else if (MACHINE_HAS_IDTE)
2429  				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2430  			else
2431  				__pmdp_csp(pmdp);
2432  			*entry = _SEGMENT_ENTRY_EMPTY;
2433  		}
2434  		spin_unlock(&gmap->guest_table_lock);
2435  	}
2436  	rcu_read_unlock();
2437  }
2438  EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2439  
2440  /**
2441   * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2442   * @gmap: pointer to guest address space
2443   * @pmdp: pointer to the pmd to be tested
2444   * @gaddr: virtual address in the guest address space
2445   *
2446   * This function is assumed to be called with the guest_table_lock
2447   * held.
2448   */
2449  static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2450  					  unsigned long gaddr)
2451  {
2452  	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2453  		return false;
2454  
2455  	/* Already protected memory that did not change is clean */
2456  	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2457  	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2458  		return false;
2459  
2460  	/* Clear UC indication and reset protection */
2461  	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
2462  	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2463  	return true;
2464  }
2465  
2466  /**
2467   * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2468   * @gmap: pointer to guest address space
2469   * @bitmap: dirty bitmap for this pmd
2470   * @gaddr: virtual address in the guest address space
2471   * @vmaddr: virtual address in the host address space
2472   *
2473   * This function is assumed to be called with the guest_table_lock
2474   * held.
2475   */
2476  void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2477  			     unsigned long gaddr, unsigned long vmaddr)
2478  {
2479  	int i;
2480  	pmd_t *pmdp;
2481  	pte_t *ptep;
2482  	spinlock_t *ptl;
2483  
2484  	pmdp = gmap_pmd_op_walk(gmap, gaddr);
2485  	if (!pmdp)
2486  		return;
2487  
2488  	if (pmd_large(*pmdp)) {
2489  		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2490  			bitmap_fill(bitmap, _PAGE_ENTRIES);
2491  	} else {
2492  		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2493  			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2494  			if (!ptep)
2495  				continue;
2496  			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2497  				set_bit(i, bitmap);
2498  			spin_unlock(ptl);
2499  		}
2500  	}
2501  	gmap_pmd_op_end(gmap, pmdp);
2502  }
2503  EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
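
/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * dirty-log pass over one 1M segment could look like this:
 *
 *	unsigned long bitmap[4] = { 0 };
 *	unsigned int i;
 *
 *	gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
 *	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
 *		mark_guest_page_dirty(gaddr + i * PAGE_SIZE);
 *
 * mark_guest_page_dirty() is a placeholder for whatever the caller does
 * with a dirty page; gaddr and vmaddr must refer to the same segment.
 */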
2504  
2505  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2506  static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2507  				    unsigned long end, struct mm_walk *walk)
2508  {
2509  	struct vm_area_struct *vma = walk->vma;
2510  
2511  	split_huge_pmd(vma, pmd, addr);
2512  	return 0;
2513  }
2514  
2515  static const struct mm_walk_ops thp_split_walk_ops = {
2516  	.pmd_entry	= thp_split_walk_pmd_entry,
2517  };
2518  
2519  static inline void thp_split_mm(struct mm_struct *mm)
2520  {
2521  	struct vm_area_struct *vma;
2522  	VMA_ITERATOR(vmi, mm, 0);
2523  
2524  	for_each_vma(vmi, vma) {
2525  		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
2526  		walk_page_vma(vma, &thp_split_walk_ops, NULL);
2527  	}
2528  	mm->def_flags |= VM_NOHUGEPAGE;
2529  }
2530  #else
2531  static inline void thp_split_mm(struct mm_struct *mm)
2532  {
2533  }
2534  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2535  
2536  /*
2537   * Remove all empty zero pages from the mapping for lazy refaulting
2538   * - This must be called after mm->context.has_pgste is set, to avoid
2539   *   future creation of zero pages
2540   * - This must be called after THP was enabled
2541   */
2542  static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
2543  			   unsigned long end, struct mm_walk *walk)
2544  {
2545  	unsigned long addr;
2546  
2547  	for (addr = start; addr != end; addr += PAGE_SIZE) {
2548  		pte_t *ptep;
2549  		spinlock_t *ptl;
2550  
2551  		ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
2552  		if (is_zero_pfn(pte_pfn(*ptep)))
2553  			ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
2554  		pte_unmap_unlock(ptep, ptl);
2555  	}
2556  	return 0;
2557  }
2558  
2559  static const struct mm_walk_ops zap_zero_walk_ops = {
2560  	.pmd_entry	= __zap_zero_pages,
2561  };
2562  
2563  /*
2564   * switch on pgstes for its userspace process (for kvm)
2565   */
2566  int s390_enable_sie(void)
2567  {
2568  	struct mm_struct *mm = current->mm;
2569  
2570  	/* Do we have pgstes? If yes, we are done */
2571  	if (mm_has_pgste(mm))
2572  		return 0;
2573  	/* Fail if the page tables are 2K */
2574  	if (!mm_alloc_pgste(mm))
2575  		return -EINVAL;
2576  	mmap_write_lock(mm);
2577  	mm->context.has_pgste = 1;
2578  	/* split thp mappings and disable thp for future mappings */
2579  	thp_split_mm(mm);
2580  	walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
2581  	mmap_write_unlock(mm);
2582  	return 0;
2583  }
2584  EXPORT_SYMBOL_GPL(s390_enable_sie);
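
/*
 * Illustrative sketch, not part of the original source: this is typically
 * done once when a KVM guest is created, before any gmap is set up:
 *
 *	rc = s390_enable_sie();
 *	if (rc)
 *		return rc;
 *	gmap = gmap_create(mm, limit);
 *
 * mm and limit are placeholders for this example.
 */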
2585  
2586  int gmap_mark_unmergeable(void)
2587  {
2588  	struct mm_struct *mm = current->mm;
2589  	struct vm_area_struct *vma;
2590  	unsigned long vm_flags;
2591  	int ret;
2592  	VMA_ITERATOR(vmi, mm, 0);
2593  
2594  	for_each_vma(vmi, vma) {
2595  		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
2596  		vm_flags = vma->vm_flags;
2597  		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
2598  				  MADV_UNMERGEABLE, &vm_flags);
2599  		if (ret)
2600  			return ret;
2601  		vm_flags_reset(vma, vm_flags);
2602  	}
2603  	mm->def_flags &= ~VM_MERGEABLE;
2604  	return 0;
2605  }
2606  EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
2607  
2608  /*
2609   * Enable storage key handling from now on and initialize the storage
2610   * keys with the default key.
2611   */
2612  static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2613  				  unsigned long next, struct mm_walk *walk)
2614  {
2615  	/* Clear storage key */
2616  	ptep_zap_key(walk->mm, addr, pte);
2617  	return 0;
2618  }
2619  
2620  /*
2621   * Give a chance to schedule after setting the storage key on 256 pages.
2622   * We only hold the mm lock, which is a rwsem, and the kvm srcu.
2623   * Both can sleep.
2624   */
2625  static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
2626  				  unsigned long next, struct mm_walk *walk)
2627  {
2628  	cond_resched();
2629  	return 0;
2630  }
2631  
2632  static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2633  				      unsigned long hmask, unsigned long next,
2634  				      struct mm_walk *walk)
2635  {
2636  	pmd_t *pmd = (pmd_t *)pte;
2637  	unsigned long start, end;
2638  	struct page *page = pmd_page(*pmd);
2639  
2640  	/*
2641  	 * The write check makes sure we do not set a key on shared
2642  	 * memory. This is needed as the walker does not differentiate
2643  	 * between actual guest memory and the process executable or
2644  	 * shared libraries.
2645  	 */
2646  	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2647  	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2648  		return 0;
2649  
2650  	start = pmd_val(*pmd) & HPAGE_MASK;
2651  	end = start + HPAGE_SIZE - 1;
2652  	__storage_key_init_range(start, end);
2653  	set_bit(PG_arch_1, &page->flags);
2654  	cond_resched();
2655  	return 0;
2656  }
2657  
2658  static const struct mm_walk_ops enable_skey_walk_ops = {
2659  	.hugetlb_entry		= __s390_enable_skey_hugetlb,
2660  	.pte_entry		= __s390_enable_skey_pte,
2661  	.pmd_entry		= __s390_enable_skey_pmd,
2662  };
2663  
2664  int s390_enable_skey(void)
2665  {
2666  	struct mm_struct *mm = current->mm;
2667  	int rc = 0;
2668  
2669  	mmap_write_lock(mm);
2670  	if (mm_uses_skeys(mm))
2671  		goto out_up;
2672  
2673  	mm->context.uses_skeys = 1;
2674  	rc = gmap_mark_unmergeable();
2675  	if (rc) {
2676  		mm->context.uses_skeys = 0;
2677  		goto out_up;
2678  	}
2679  	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2680  
2681  out_up:
2682  	mmap_write_unlock(mm);
2683  	return rc;
2684  }
2685  EXPORT_SYMBOL_GPL(s390_enable_skey);
2686  
2687  /*
2688   * Reset CMMA state, make all pages stable again.
2689   */
2690  static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2691  			     unsigned long next, struct mm_walk *walk)
2692  {
2693  	ptep_zap_unused(walk->mm, addr, pte, 1);
2694  	return 0;
2695  }
2696  
2697  static const struct mm_walk_ops reset_cmma_walk_ops = {
2698  	.pte_entry		= __s390_reset_cmma,
2699  };
2700  
2701  void s390_reset_cmma(struct mm_struct *mm)
2702  {
2703  	mmap_write_lock(mm);
2704  	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2705  	mmap_write_unlock(mm);
2706  }
2707  EXPORT_SYMBOL_GPL(s390_reset_cmma);
2708  
2709  #define GATHER_GET_PAGES 32
2710  
2711  struct reset_walk_state {
2712  	unsigned long next;
2713  	unsigned long count;
2714  	unsigned long pfns[GATHER_GET_PAGES];
2715  };
2716  
2717  static int s390_gather_pages(pte_t *ptep, unsigned long addr,
2718  			     unsigned long next, struct mm_walk *walk)
2719  {
2720  	struct reset_walk_state *p = walk->private;
2721  	pte_t pte = READ_ONCE(*ptep);
2722  
2723  	if (pte_present(pte)) {
2724  		/* we have a reference from the mapping, take an extra one */
2725  		get_page(phys_to_page(pte_val(pte)));
2726  		p->pfns[p->count] = phys_to_pfn(pte_val(pte));
2727  		p->next = next;
2728  		p->count++;
2729  	}
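	/*
	 * A non-zero return value stops the page walk; the caller resumes
	 * from p->next once the gathered pages have been processed.
	 */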
2730  	return p->count >= GATHER_GET_PAGES;
2731  }
2732  
2733  static const struct mm_walk_ops gather_pages_ops = {
2734  	.pte_entry = s390_gather_pages,
2735  };
2736  
2737  /*
2738   * Call the Destroy secure page UVC on each page in the given array of PFNs.
2739   * Each page needs to have an extra reference, which will be released here.
2740   */
2741  void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
2742  {
2743  	unsigned long i;
2744  
2745  	for (i = 0; i < count; i++) {
2746  		/* we always have an extra reference */
2747  		uv_destroy_owned_page(pfn_to_phys(pfns[i]));
2748  		/* get rid of the extra reference */
2749  		put_page(pfn_to_page(pfns[i]));
2750  		cond_resched();
2751  	}
2752  }
2753  EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
2754  
2755  /**
2756   * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
2757   * in the given range of the given address space.
2758   * @mm: the mm to operate on
2759   * @start: the start of the range
2760   * @end: the end of the range
2761   * @interruptible: if not 0, stop when a fatal signal is received
2762   *
2763   * Walk the given range of the given address space and call the destroy
2764   * secure page UVC on each page. Optionally exit early if a fatal signal is
2765   * pending.
2766   *
2767   * Return: 0 on success, -EINTR if the function stopped before completing
2768   */
2769  int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
2770  			    unsigned long end, bool interruptible)
2771  {
2772  	struct reset_walk_state state = { .next = start };
2773  	int r = 1;
2774  
2775  	while (r > 0) {
2776  		state.count = 0;
2777  		mmap_read_lock(mm);
2778  		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
2779  		mmap_read_unlock(mm);
2780  		cond_resched();
2781  		s390_uv_destroy_pfns(state.count, state.pfns);
2782  		if (interruptible && fatal_signal_pending(current))
2783  			return -EINTR;
2784  	}
2785  	return 0;
2786  }
2787  EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
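
/*
 * Illustrative sketch, not part of the original source: destroying all
 * secure pages of an address space could be requested with
 *
 *	rc = __s390_uv_destroy_range(mm, 0, TASK_SIZE, true);
 *
 * where the final argument asks the walk to stop early with -EINTR when a
 * fatal signal is pending.
 */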
2788  
2789  /**
2790   * s390_unlist_old_asce - Remove the topmost level of page tables from the
2791   * list of page tables of the gmap.
2792   * @gmap: the gmap whose table is to be removed
2793   *
2794   * On s390x, KVM keeps a list of all pages containing the page tables of the
2795   * gmap (the CRST list). This list is used at tear down time to free all
2796   * pages that are now not needed anymore.
2797   *
2798   * This function removes the topmost page of the tree (the one pointed to by
2799   * the ASCE) from the CRST list.
2800   *
2801   * This means that it will not be freed when the VM is torn down, and needs
2802   * to be handled separately by the caller, unless a leak is actually
2803   * intended. Notice that this function will only remove the page from the
2804   * list, the page will still be used as a top level page table (and ASCE).
2805   */
2806  void s390_unlist_old_asce(struct gmap *gmap)
2807  {
2808  	struct page *old;
2809  
2810  	old = virt_to_page(gmap->table);
2811  	spin_lock(&gmap->guest_table_lock);
2812  	list_del(&old->lru);
2813  	/*
2814  	 * Sometimes the topmost page might need to be "removed" multiple
2815  	 * times, for example if the VM is rebooted into secure mode several
2816  	 * times concurrently, or if s390_replace_asce fails after calling
2817  	 * s390_remove_old_asce and is attempted again later. In that case
2818  	 * the old asce has been removed from the list, and therefore it
2819  	 * will not be freed when the VM terminates, but the ASCE is still
2820  	 * in use and still pointed to.
2821  	 * A subsequent call to replace_asce will follow the pointer and try
2822  	 * to remove the same page from the list again.
2823  	 * Therefore it's necessary that the page of the ASCE has valid
2824  	 * pointers, so list_del can work (and do nothing) without
2825  	 * dereferencing stale or invalid pointers.
2826  	 */
2827  	INIT_LIST_HEAD(&old->lru);
2828  	spin_unlock(&gmap->guest_table_lock);
2829  }
2830  EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
2831  
2832  /**
2833   * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
2834   * @gmap: the gmap whose ASCE needs to be replaced
2835   *
2836   * If the allocation of the new top level page table fails, the ASCE is not
2837   * replaced.
2838   * In any case, the old ASCE is always removed from the gmap CRST list.
2839   * Therefore the caller has to make sure to save a pointer to it
2840   * beforehand, unless a leak is actually intended.
2841   */
2842  int s390_replace_asce(struct gmap *gmap)
2843  {
2844  	unsigned long asce;
2845  	struct page *page;
2846  	void *table;
2847  
2848  	s390_unlist_old_asce(gmap);
2849  
2850  	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
2851  	if (!page)
2852  		return -ENOMEM;
2853  	table = page_to_virt(page);
2854  	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
2855  
2856  	/*
2857  	 * The caller has to deal with the old ASCE, but here we make sure
2858  	 * the new one is properly added to the CRST list, so that
2859  	 * it will be freed when the VM is torn down.
2860  	 */
2861  	spin_lock(&gmap->guest_table_lock);
2862  	list_add(&page->lru, &gmap->crst_list);
2863  	spin_unlock(&gmap->guest_table_lock);
2864  
2865  	/* Set new table origin while preserving existing ASCE control bits */
2866  	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
2867  	WRITE_ONCE(gmap->asce, asce);
2868  	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
2869  	WRITE_ONCE(gmap->table, table);
2870  
2871  	return 0;
2872  }
2873  EXPORT_SYMBOL_GPL(s390_replace_asce);
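
/*
 * Illustrative sketch, not part of the original source: because the old top
 * level table is always unlisted, a hypothetical caller that wants to deal
 * with it later has to save a pointer before the call:
 *
 *	struct page *old = virt_to_page(gmap->table);
 *
 *	rc = s390_replace_asce(gmap);
 *
 * After the call (even on failure), "old" is no longer on the CRST list and
 * will not be freed at VM teardown; whether and when it may be freed
 * depends on how the old ASCE is still being used.
 */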
2874