/*
 * pci.c: HVM PCI setup.
 *
 * Leendert van Doorn, leendert@watson.ibm.com
 * Copyright (c) 2005, International Business Machines Corporation.
 *
 * Copyright (c) 2006, Keir Fraser, XenSource Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include "util.h"
#include "hypercall.h"
#include "config.h"
#include "pci_regs.h"

#include <xen/memory.h>
#include <xen/hvm/ioreq.h>
#include <xen/hvm/hvm_xs_strings.h>
#include <xen/hvm/e820.h>

uint32_t pci_mem_start = HVM_BELOW_4G_MMIO_START;
const uint32_t pci_mem_end = RESERVED_MEMBASE;
uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0;

/*
 * BARs larger than this value are put in 64-bit space unconditionally.  That
 * is, such BARs also don't play into the determination of how big the lowmem
 * MMIO hole needs to be.
 */
#define BAR_RELOC_THRESH GB(1)
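/*
 * For illustration: with BAR_RELOC_THRESH = 1GiB, a 2GiB 64-bit memory BAR
 * is placed above 4GiB unconditionally and is not counted in mmio_total
 * when sizing the lowmem hole below.
 */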

enum virtual_vga virtual_vga = VGA_none;
unsigned long igd_opregion_pgbase = 0;

/* Check if the specified range conflicts with any reserved device memory. */
static bool check_overlap_all(uint64_t start, uint64_t size)
{
    unsigned int i;

    for ( i = 0; i < memory_map.nr_map; i++ )
    {
        if ( memory_map.map[i].type == E820_RESERVED &&
             check_overlap(start, size,
                           memory_map.map[i].addr,
                           memory_map.map[i].size) )
            return true;
    }

    return false;
}

/* Find the lowest RMRR ending above base but below 4G. */
static int find_next_rmrr(uint32_t base)
{
    unsigned int i;
    int next_rmrr = -1;
    uint64_t end, min_end = GB(4);

    for ( i = 0; i < memory_map.nr_map ; i++ )
    {
        end = memory_map.map[i].addr + memory_map.map[i].size;

        if ( memory_map.map[i].type == E820_RESERVED &&
             end > base && end <= min_end )
        {
            next_rmrr = i;
            min_end = end;
        }
    }

    return next_rmrr;
}

void pci_setup(void)
{
    uint8_t is_64bar, using_64bar, bar64_relocate = 0;
    uint32_t devfn, bar_reg, cmd, bar_data, bar_data_upper;
    uint64_t base, bar_sz, bar_sz_upper, mmio_total = 0;
    uint32_t vga_devfn = 256;
    uint16_t class, vendor_id, device_id;
    unsigned int bar, pin, link, isa_irq;
    uint8_t pci_devfn_decode_type[256] = {};

    /* Resources assignable to PCI devices via BARs. */
    struct resource {
        uint64_t base, max;
    } *resource, mem_resource, high_mem_resource, io_resource;

    /* Create a list of device BARs in descending order of size. */
    struct bars {
        uint32_t is_64bar;
        uint32_t devfn;
        uint32_t bar_reg;
        uint64_t bar_sz;
    } *bars = (struct bars *)scratch_start;
    unsigned int i, nr_bars = 0;
    uint64_t mmio_hole_size = 0;

    const char *s;
    /*
     * Do we allow hvmloader to relocate guest memory in order to
     * increase the size of the lowmem MMIO hole?  Defaulting to 1
     * here means that non-libxl toolstacks (including xend and
     * home-grown ones) using qemu-xen will still experience the
     * memory relocation bug described below; but it also means that
     * those using qemu-traditional will *not* experience any change;
     * and it means there is a work-around for those using qemu-xen,
     * namely switching to qemu-traditional.
     *
     * If we defaulted to 0, and failing to resize the hole caused any
     * problems with qemu-traditional, then there would be no work-around.
     *
     * Since xend can only use qemu-traditional, this is the option
     * that will have the least impact.
     */
    bool allow_memory_relocate = 1;

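    /*
     * pci_devfn_decode_type[] entries are uint8_t; check that the
     * PCI_COMMAND_* bits stashed in them survive the truncation.
     */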
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_IO !=
                 PCI_COMMAND_IO);
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MEMORY !=
                 PCI_COMMAND_MEMORY);
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MASTER !=
                 PCI_COMMAND_MASTER);

    s = xenstore_read(HVM_XS_ALLOW_MEMORY_RELOCATE, NULL);
    if ( s )
        allow_memory_relocate = strtoll(s, NULL, 0);
    printf("Relocating guest memory for lowmem MMIO space %s\n",
           allow_memory_relocate ? "enabled" : "disabled");

    s = xenstore_read("platform/mmio_hole_size", NULL);
    if ( s )
        mmio_hole_size = strtoll(s, NULL, 0);

    /* Program PCI-ISA bridge with appropriate link routes. */
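    /*
     * For illustration: if PCI_ISA_IRQ_MASK were 0x0c20 (IRQs 5, 10 and 11;
     * the real value comes from config.h), the loop below would route links
     * 0-3 to IRQ5, IRQ10, IRQ11 and, wrapping around, IRQ5 again.
     */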
    isa_irq = 0;
    for ( link = 0; link < 4; link++ )
    {
        do { isa_irq = (isa_irq + 1) & 15;
        } while ( !(PCI_ISA_IRQ_MASK & (1U << isa_irq)) );
        pci_writeb(PCI_ISA_DEVFN, 0x60 + link, isa_irq);
        printf("PCI-ISA link %u routed to IRQ%u\n", link, isa_irq);
    }

    /* Program ELCR to match PCI-wired IRQs. */
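    /*
     * The ELCR lives at I/O ports 0x4d0 (IRQs 0-7) and 0x4d1 (IRQs 8-15);
     * a set bit marks the corresponding IRQ as level-triggered, as PCI
     * interrupts must be.
     */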
    outb(0x4d0, (uint8_t)(PCI_ISA_IRQ_MASK >> 0));
    outb(0x4d1, (uint8_t)(PCI_ISA_IRQ_MASK >> 8));

    /* Scan the PCI bus and map resources. */
    for ( devfn = 0; devfn < 256; devfn++ )
    {
        class = pci_readw(devfn, PCI_CLASS_DEVICE);
        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
        device_id = pci_readw(devfn, PCI_DEVICE_ID);
        if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
            continue;

        ASSERT((devfn != PCI_ISA_DEVFN) ||
               ((vendor_id == 0x8086) && (device_id == 0x7000)));

        switch ( class )
        {
        case 0x0300:
            /* If emulated VGA is found, preserve it as primary VGA. */
            if ( (vendor_id == 0x1234) && (device_id == 0x1111) )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_std;
            }
            else if ( (vendor_id == 0x1013) && (device_id == 0xb8) )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_cirrus;
            }
            else if ( virtual_vga == VGA_none )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_pt;
                if ( vendor_id == 0x8086 )
                {
                    igd_opregion_pgbase = mem_hole_alloc(IGD_OPREGION_PAGES);
                    /*
                     * Write the OpRegion offset to give the OpRegion
                     * address to the device model. The device model will
                     * trap and map the OpRegion at the given address.
                     */
                    pci_writel(vga_devfn, PCI_INTEL_OPREGION,
                               igd_opregion_pgbase << PAGE_SHIFT);
                }
            }
            break;
        case 0x0680:
            /* PIIX4 ACPI PM. Special device with special PCI config space. */
            ASSERT((vendor_id == 0x8086) && (device_id == 0x7113));
            pci_writew(devfn, 0x20, 0x0000); /* No smb bus IO enable */
            pci_writew(devfn, 0xd2, 0x0000); /* No smb bus IO enable */
            pci_writew(devfn, 0x22, 0x0000);
            pci_writew(devfn, 0x3c, 0x0009); /* Hardcoded IRQ9 */
            pci_writew(devfn, 0x3d, 0x0001);
            pci_writel(devfn, 0x40, ACPI_PM1A_EVT_BLK_ADDRESS_V1 | 1);
            pci_writeb(devfn, 0x80, 0x01); /* enable PM io space */
            break;
        case 0x0101:
            if ( vendor_id == 0x8086 )
            {
                /* Intel ICHs since PIIX3: enable IDE legacy mode. */
                pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
                pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
            }
            break;
        }

        /*
         * It is recommended that BAR programming be done whilst decode
         * bits are cleared to avoid incorrect mappings being created.
         * When a 64-bit memory BAR is programmed by writing the lower
         * half first and then the upper half, the intermediate value can
         * map to an address below 4G: as soon as the lower half is
         * written it replaces any RAM mapped at that address, and that
         * RAM is not restored once the upper half is written and the
         * BAR is correctly mapped to its intended high memory address.
         */
        cmd = pci_readw(devfn, PCI_COMMAND);
        cmd &= ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
        pci_writew(devfn, PCI_COMMAND, cmd);

        /* Map the I/O memory and port resources. */
        for ( bar = 0; bar < 7; bar++ )
        {
            bar_sz_upper = 0;
            bar_reg = PCI_BASE_ADDRESS_0 + 4*bar;
            if ( bar == 6 )
                bar_reg = PCI_ROM_ADDRESS;

            bar_data = pci_readl(devfn, bar_reg);
            if ( bar_reg != PCI_ROM_ADDRESS )
            {
                is_64bar = !!((bar_data & (PCI_BASE_ADDRESS_SPACE |
                              PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
                              (PCI_BASE_ADDRESS_SPACE_MEMORY |
                              PCI_BASE_ADDRESS_MEM_TYPE_64));
                pci_writel(devfn, bar_reg, ~0);
            }
            else
            {
                is_64bar = 0;
                pci_writel(devfn, bar_reg,
                           (bar_data | PCI_ROM_ADDRESS_MASK) &
                           ~PCI_ROM_ADDRESS_ENABLE);
            }
            bar_sz = pci_readl(devfn, bar_reg);
            pci_writel(devfn, bar_reg, bar_data);

            if ( bar_reg != PCI_ROM_ADDRESS )
                bar_sz &= (((bar_data & PCI_BASE_ADDRESS_SPACE) ==
                            PCI_BASE_ADDRESS_SPACE_MEMORY) ?
                           PCI_BASE_ADDRESS_MEM_MASK :
                           (PCI_BASE_ADDRESS_IO_MASK & 0xffff));
            else
                bar_sz &= PCI_ROM_ADDRESS_MASK;
            if ( is_64bar )
            {
                bar_data_upper = pci_readl(devfn, bar_reg + 4);
                pci_writel(devfn, bar_reg + 4, ~0);
                bar_sz_upper = pci_readl(devfn, bar_reg + 4);
                pci_writel(devfn, bar_reg + 4, bar_data_upper);
                bar_sz = (bar_sz_upper << 32) | bar_sz;
            }
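            /*
             * Isolate the lowest set bit to recover the BAR size: after
             * writing all-ones and masking, a BAR reads back with its
             * size-alignment bits clear.  E.g. a read-back of 0xfff00000
             * yields bar_sz = 0x00100000, i.e. a 1MiB BAR.
             */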
            bar_sz &= ~(bar_sz - 1);
            if ( bar_sz == 0 )
                continue;

            for ( i = 0; i < nr_bars; i++ )
                if ( bars[i].bar_sz < bar_sz )
                    break;

            if ( i != nr_bars )
                memmove(&bars[i+1], &bars[i], (nr_bars-i) * sizeof(*bars));

            bars[i].is_64bar = is_64bar;
            bars[i].devfn = devfn;
            bars[i].bar_reg = bar_reg;
            bars[i].bar_sz = bar_sz;

            if ( is_64bar && bar_sz > BAR_RELOC_THRESH )
                bar64_relocate = 1;
            else if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
                       PCI_BASE_ADDRESS_SPACE_MEMORY) ||
                      (bar_reg == PCI_ROM_ADDRESS) )
                mmio_total += bar_sz;

            nr_bars++;

            /* The upper half is already calculated, skip it! */
            if ( is_64bar )
                bar++;
        }

        /* Map the interrupt. */
        pin = pci_readb(devfn, PCI_INTERRUPT_PIN);
        if ( pin != 0 )
        {
            /* This is the barber's pole mapping used by Xen. */
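            /*
             * E.g. device 3 asserting INTB (pin 2) maps to link
             * ((2 - 1) + 3) & 3 = 0, so interrupt load is spread
             * evenly across the four links as the slot number grows.
             */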
            link = ((pin - 1) + (devfn >> 3)) & 3;
            isa_irq = pci_readb(PCI_ISA_DEVFN, 0x60 + link);
            pci_writeb(devfn, PCI_INTERRUPT_LINE, isa_irq);
            printf("pci dev %02x:%x INT%c->IRQ%u\n",
                   devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
        }

        /* Enable bus master for this function later. */
        pci_devfn_decode_type[devfn] = PCI_COMMAND_MASTER;
    }

    if ( mmio_hole_size )
    {
        uint64_t max_ram_below_4g = GB(4) - mmio_hole_size;

        if ( max_ram_below_4g > HVM_BELOW_4G_MMIO_START )
        {
            printf("max_ram_below_4g=0x"PRIllx
                   " exceeds the default MMIO start;"
                   " mmio_hole_size=0x"PRIllx" has been ignored.\n",
                   PRIllx_arg(max_ram_below_4g),
                   PRIllx_arg(mmio_hole_size));
        }
        else
        {
            pci_mem_start = max_ram_below_4g;
            printf("pci_mem_start=0x%x (was 0x%x) for mmio_hole_size=0x%lx\n",
                   pci_mem_start, HVM_BELOW_4G_MMIO_START,
                   (long)mmio_hole_size);
        }
    }
    else
    {
        /*
         * At the moment qemu-xen can't deal with relocated memory regions.
         * It's too close to the release to make a proper fix; for now,
         * only allow the MMIO hole to grow large enough to move guest memory
         * if we're running qemu-traditional.  Items that don't fit will be
         * relocated into the 64-bit address space.
         *
         * This loop now does the following:
         * - If allow_memory_relocate, increase the MMIO hole until it's
         *   big enough, or until it's 2GiB
         * - If !allow_memory_relocate, increase the MMIO hole until it's
         *   big enough, or until it's 2GiB, or until it overlaps guest
         *   memory
         */
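        /*
         * Each iteration doubles the hole by halving its start address,
         * e.g. 0xf0000000 -> 0xe0000000 -> 0xc0000000 -> 0x80000000;
         * (pci_mem_start << 1) == 0 then stops the loop at a 2GiB hole.
         */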
        while ( (mmio_total > (pci_mem_end - pci_mem_start))
                && ((pci_mem_start << 1) != 0)
                && (allow_memory_relocate
                    || (((pci_mem_start << 1) >> PAGE_SHIFT)
                        >= hvm_info->low_mem_pgend)) )
            pci_mem_start <<= 1;

        /*
         * Try to accommodate RMRRs in our MMIO region on a best-effort basis.
         * If we have RMRRs in the range, then make pci_mem_start just after
         * hvm_info->low_mem_pgend.
         */
        if ( pci_mem_start > (hvm_info->low_mem_pgend << PAGE_SHIFT) &&
             check_overlap_all(pci_mem_start, pci_mem_end-pci_mem_start) )
            pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT;
    }

    if ( mmio_total > (pci_mem_end - pci_mem_start) || bar64_relocate )
    {
        printf("Low MMIO hole not large enough for all devices,"
               " relocating some BARs to 64-bit\n");
        bar64_relocate = 1;
    }

    /* Relocate RAM that overlaps PCI space (in 64k-page chunks). */
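    /*
     * XENMAPSPACE_gmfn_range moves a run of guest frames in one hypercall:
     * idx is the first source gpfn, gpfn the first destination gpfn, and
     * size the number of frames, capped here at 0xffff per batch.
     */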
    while ( (pci_mem_start >> PAGE_SHIFT) < hvm_info->low_mem_pgend )
    {
        struct xen_add_to_physmap xatp;
        unsigned int nr_pages = min_t(
            unsigned int,
            hvm_info->low_mem_pgend - (pci_mem_start >> PAGE_SHIFT),
            (1u << 16) - 1);
        if ( hvm_info->high_mem_pgend == 0 )
            hvm_info->high_mem_pgend = 1ull << (32 - PAGE_SHIFT);
        hvm_info->low_mem_pgend -= nr_pages;
        printf("Relocating 0x%x pages from "PRIllx" to "PRIllx\
               " for lowmem MMIO hole\n",
               nr_pages,
               PRIllx_arg(((uint64_t)hvm_info->low_mem_pgend)<<PAGE_SHIFT),
               PRIllx_arg(((uint64_t)hvm_info->high_mem_pgend)<<PAGE_SHIFT));
        xatp.domid = DOMID_SELF;
        xatp.space = XENMAPSPACE_gmfn_range;
        xatp.idx = hvm_info->low_mem_pgend;
        xatp.gpfn = hvm_info->high_mem_pgend;
        xatp.size = nr_pages;
        if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
            BUG();
        hvm_info->high_mem_pgend += nr_pages;
    }

    /* Sync memory map[] if necessary. */
    adjust_memory_map();

    high_mem_resource.base = ((uint64_t)hvm_info->high_mem_pgend) << PAGE_SHIFT;
    if ( high_mem_resource.base < GB(4) )
    {
        if ( hvm_info->high_mem_pgend != 0 )
            printf("WARNING: hvm_info->high_mem_pgend %x"
                   " does not point into high memory!\n",
                   hvm_info->high_mem_pgend);
        high_mem_resource.base = GB(4);
    }
    printf("%sRAM in high memory; setting high_mem resource base to "PRIllx"\n",
           hvm_info->high_mem_pgend ? "" : "No ",
           PRIllx_arg(high_mem_resource.base));
    high_mem_resource.max = 1ull << cpu_phys_addr();
    mem_resource.base = pci_mem_start;
    mem_resource.max = pci_mem_end;
    io_resource.base = 0xc000;
    io_resource.max = 0x10000;

    /* Assign iomem and ioport resources in descending order of size. */
    for ( i = 0; i < nr_bars; i++ )
    {
        devfn = bars[i].devfn;
        bar_reg = bars[i].bar_reg;
        bar_sz = bars[i].bar_sz;

        /*
         * Relocate to high memory if the total amount of MMIO needed
         * is more than the low MMIO available or BARs bigger than
         * BAR_RELOC_THRESH are present.  Because devices are
         * processed in order of bar_sz, this will preferentially
         * relocate larger devices to high memory first.
         *
         * NB: The code here is rather fragile, as the check here to see
         * whether bar_sz will fit in the low MMIO region doesn't match the
         * real check made below, which involves aligning the base offset of
         * the bar with the size of the bar itself.  As it happens, this will
         * always be satisfied because:
         * - The first one will succeed because the MMIO hole can only start
         *   at 0x{f,e,c,8}0000000.  If it fits, it will be aligned properly.
         * - All subsequent ones will be aligned because the list is ordered
         *   large to small, and bar_sz is always a power of 2.  (At least
         *   the code here assumes it to be.)
         * Should either of those two conditions change, this code will break.
         */
        using_64bar = bars[i].is_64bar && bar64_relocate &&
            (mmio_total > (mem_resource.max - mem_resource.base) ||
             bar_sz > BAR_RELOC_THRESH);
        bar_data = pci_readl(devfn, bar_reg);

        if ( (bar_data & PCI_BASE_ADDRESS_SPACE) ==
             PCI_BASE_ADDRESS_SPACE_MEMORY )
        {
            /* Map into high memory if the device has a 64-bit BAR. */
            if ( using_64bar )
            {
                if ( high_mem_resource.base & (bar_sz - 1) )
                    high_mem_resource.base = high_mem_resource.base -
                        (high_mem_resource.base & (bar_sz - 1)) + bar_sz;
                if ( !pci_hi_mem_start )
                    pci_hi_mem_start = high_mem_resource.base;
                resource = &high_mem_resource;
                bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
            }
            else
            {
                resource = &mem_resource;
                bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
            }
            if ( bar_sz <= BAR_RELOC_THRESH )
                mmio_total -= bar_sz;
        }
        else
        {
            resource = &io_resource;
            bar_data &= ~PCI_BASE_ADDRESS_IO_MASK;
        }

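        /*
         * Round the candidate base up to the BAR's natural alignment,
         * e.g. resource->base = 0xf0004000 with bar_sz = 0x8000 yields
         * base = 0xf0008000.
         */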
        base = (resource->base + bar_sz - 1) & ~(uint64_t)(bar_sz - 1);

        /* If we're using mem_resource, check for RMRR conflicts. */
        if ( resource == &mem_resource )
        {
            int next_rmrr = find_next_rmrr(base);

            while ( next_rmrr >= 0 &&
                    check_overlap(base, bar_sz,
                                  memory_map.map[next_rmrr].addr,
                                  memory_map.map[next_rmrr].size) )
            {
                base = memory_map.map[next_rmrr].addr +
                       memory_map.map[next_rmrr].size;
                base = (base + bar_sz - 1) & ~(bar_sz - 1);
                next_rmrr = find_next_rmrr(base);
            }
        }

        bar_data |= (uint32_t)base;
        bar_data_upper = (uint32_t)(base >> 32);
        base += bar_sz;

        if ( (base < resource->base) || (base > resource->max) )
        {
            printf("pci dev %02x:%x bar %02x size "PRIllx": no space for "
                   "resource!\n", devfn>>3, devfn&7, bar_reg,
                   PRIllx_arg(bar_sz));
            continue;
        }

        resource->base = base;

        pci_writel(devfn, bar_reg, bar_data);
        if ( using_64bar )
            pci_writel(devfn, bar_reg + 4, bar_data_upper);
        printf("pci dev %02x:%x bar %02x size "PRIllx": %x%08x\n",
               devfn>>3, devfn&7, bar_reg,
               PRIllx_arg(bar_sz),
               bar_data_upper, bar_data);

        if ( (bar_reg == PCI_ROM_ADDRESS) ||
             ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
              PCI_BASE_ADDRESS_SPACE_MEMORY) )
            pci_devfn_decode_type[devfn] |= PCI_COMMAND_MEMORY;
        else
            pci_devfn_decode_type[devfn] |= PCI_COMMAND_IO;
    }

    if ( pci_hi_mem_start )
    {
        /*
         * Make the end address's alignment match the start address's so
         * that fewer variable range MTRRs are needed to cover the range.
         */
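        /*
         * Worked example (illustrative values): pci_hi_mem_start =
         * 0x440000000 has lowest set bit 0x40000000, so with
         * high_mem_resource.base = 0x450000000 this computes
         * pci_hi_mem_end = ((0x44fffffff | 0x3fffffff) + 1) = 0x480000000,
         * which is aligned to 0x40000000 just like the start.
         */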
        pci_hi_mem_end = ((high_mem_resource.base - 1) |
                          ((pci_hi_mem_start & -pci_hi_mem_start) - 1)) + 1;
    }

    if ( vga_devfn != 256 )
    {
        /*
         * VGA registers live in I/O space so ensure that primary VGA
         * has IO enabled, even if there is no I/O BAR on that
         * particular device.
         */
        pci_devfn_decode_type[vga_devfn] |= PCI_COMMAND_IO;
    }

    /* Enable bus master, memory and I/O decode for all valid functions. */
    for ( devfn = 0; devfn < 256; devfn++ )
        if ( pci_devfn_decode_type[devfn] )
        {
            cmd = pci_readw(devfn, PCI_COMMAND);
            cmd |= pci_devfn_decode_type[devfn];
            pci_writew(devfn, PCI_COMMAND, cmd);
        }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */