| /* |
| * PPC64 (SPAPR) platform support |
| * |
| * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. |
| * |
| * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM |
| * Corporation. |
| * |
| * This program is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 as published |
| * by the Free Software Foundation. |
| */ |
| |
| #include "kvm/kvm.h" |
| #include "kvm/util.h" |
| #include "libfdt.h" |
| #include "cpu_info.h" |
| |
| #include "spapr.h" |
| #include "spapr_hvcons.h" |
| #include "spapr_pci.h" |
| |
| #include <linux/kvm.h> |
| |
| #include <sys/types.h> |
| #include <sys/ioctl.h> |
| #include <sys/mman.h> |
| #include <stdbool.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <stdio.h> |
| #include <fcntl.h> |
| #include <asm/unistd.h> |
| #include <errno.h> |
| |
| #include <linux/byteorder.h> |
| |
/*
 * log2 of the guest hashed page table size: the HPT is allocated as
 * (1 << HPT_ORDER) bytes and the same order is advertised to the guest in
 * the "ibm,pft-size" CPU property.
 */
#define HPT_ORDER 24

/* Mount point substituted when the hugetlbfs path is given as "default". */
#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/"

/* phandle given to the XICP interrupt-controller node in the device tree. */
#define PHANDLE_XICP 0x00001111

/*
 * Private copy of the kernel command line.  Filled in when the kernel image
 * is loaded and emitted later as /chosen/bootargs.
 */
static char kern_cmdline[2048];
| |
| struct kvm_ext kvm_req_ext[] = { |
| { DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) }, |
| { DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) }, |
| { 0, 0 } |
| }; |
| |
| static uint32_t mfpvr(void) |
| { |
| uint32_t r; |
| asm volatile ("mfpvr %0" : "=r"(r)); |
| return r; |
| } |
| |
/*
 * Report whether the host CPU can run a VM.  No probing is performed here;
 * the answer is unconditionally "yes".
 */
bool kvm__arch_cpu_supports_vm(void)
{
	return true;
}
| |
| void kvm__init_ram(struct kvm *kvm) |
| { |
| u64 phys_start, phys_size; |
| void *host_mem; |
| |
| phys_start = 0; |
| phys_size = kvm->ram_size; |
| host_mem = kvm->ram_start; |
| |
| /* |
| * We put MMIO at PPC_MMIO_START, high up. Make sure that this doesn't |
| * crash into the end of RAM -- on PPC64 at least, this is so high |
| * (63TB!) that this is unlikely. |
| */ |
| if (phys_size >= PPC_MMIO_START) |
| die("Too much memory (%lld, what a nice problem): " |
| "overlaps MMIO!\n", |
| phys_size); |
| |
| kvm__register_mem(kvm, phys_start, phys_size, host_mem); |
| } |
| |
/*
 * Hook for appending architecture-specific kernel command line options.
 * This platform adds nothing, so both arguments are left untouched.
 */
void kvm__arch_set_cmdline(char *cmdline, bool video)
{
	/* Nothing platform-specific is required on the command line. */
}
| |
| /* Architecture-specific KVM init */ |
| void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) |
| { |
| int cap_ppc_rma; |
| unsigned long hpt; |
| |
| kvm->ram_size = ram_size; |
| |
| /* Map "default" hugetblfs path to the standard 16M mount point */ |
| if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default")) |
| hugetlbfs_path = HUGETLBFS_PATH; |
| |
| kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size); |
| |
| if (kvm->ram_start == MAP_FAILED) |
| die("Couldn't map %lld bytes for RAM (%d)\n", |
| kvm->ram_size, errno); |
| |
| /* FDT goes at top of memory, RTAS just below */ |
| kvm->arch.fdt_gra = kvm->ram_size - FDT_MAX_SIZE; |
| /* FIXME: Not all PPC systems have RTAS */ |
| kvm->arch.rtas_gra = kvm->arch.fdt_gra - RTAS_MAX_SIZE; |
| madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE); |
| |
| /* FIXME: SPAPR-PR specific; allocate a guest HPT. */ |
| if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER))) |
| die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER)); |
| |
| kvm->arch.sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18); |
| |
| kvm->arch.pvr = mfpvr(); |
| |
| /* FIXME: This is book3s-specific */ |
| cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA); |
| if (cap_ppc_rma == 2) |
| die("Need contiguous RMA allocation on this hardware, " |
| "which is not yet supported."); |
| |
| /* Do these before FDT setup, IRQ setup, etc. */ |
| /* FIXME: SPAPR-specific */ |
| hypercall_init(); |
| register_core_rtas(); |
| /* Now that hypercalls are initialised, register a couple for the console: */ |
| spapr_hvcons_init(); |
| spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID, |
| SPAPR_PCI_MEM_WIN_ADDR, |
| SPAPR_PCI_MEM_WIN_SIZE, |
| SPAPR_PCI_IO_WIN_ADDR, |
| SPAPR_PCI_IO_WIN_SIZE); |
| } |
| |
| void kvm__arch_delete_ram(struct kvm *kvm) |
| { |
| munmap(kvm->ram_start, kvm->ram_size); |
| } |
| |
/* Pulse an interrupt line: drive @irq to 1 and then straight back to 0. */
void kvm__irq_trigger(struct kvm *kvm, int irq)
{
	/* Assert ... */
	kvm__irq_line(kvm, irq, 1);
	/* ... then de-assert. */
	kvm__irq_line(kvm, irq, 0);
}
| |
/*
 * Periodic housekeeping hook.  At present the only platform device polled
 * from here is the SPAPR hypervisor console.
 */
void kvm__arch_periodic_poll(struct kvm *kvm)
{
	/* FIXME: Should register callbacks to platform-specific polls */
	spapr_hvcons_poll(kvm);
}
| |
| int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline) |
| { |
| void *p; |
| void *k_start; |
| void *i_start; |
| int nr; |
| |
| if (lseek(fd_kernel, 0, SEEK_SET) < 0) |
| die_perror("lseek"); |
| |
| p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR); |
| |
| while ((nr = read(fd_kernel, p, 65536)) > 0) |
| p += nr; |
| |
| pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, p-k_start); |
| |
| if (fd_initrd != -1) { |
| if (lseek(fd_initrd, 0, SEEK_SET) < 0) |
| die_perror("lseek"); |
| |
| if (p-k_start > INITRD_LOAD_ADDR) |
| die("Kernel overlaps initrd!"); |
| |
| /* Round up kernel size to 8byte alignment, and load initrd right after. */ |
| i_start = p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR); |
| |
| while (((nr = read(fd_initrd, p, 65536)) > 0) && |
| p < (kvm->ram_start + kvm->ram_size)) |
| p += nr; |
| |
| if (p >= (kvm->ram_start + kvm->ram_size)) |
| die("initrd too big to contain in guest RAM.\n"); |
| |
| pr_info("Loaded initrd to 0x%x (%ld bytes)", |
| INITRD_LOAD_ADDR, p-i_start); |
| kvm->arch.initrd_gra = INITRD_LOAD_ADDR; |
| kvm->arch.initrd_size = p-i_start; |
| } else { |
| kvm->arch.initrd_size = 0; |
| } |
| strncpy(kern_cmdline, kernel_cmdline, 2048); |
| kern_cmdline[2047] = '\0'; |
| |
| return true; |
| } |
| |
| bool load_bzimage(struct kvm *kvm, int fd_kernel, |
| int fd_initrd, const char *kernel_cmdline, u16 vidmode) |
| { |
| /* We don't support bzImages. */ |
| return false; |
| } |
| |
/*
 * A raw device-tree property value built ahead of time.
 * generate_segment_page_sizes() fills it in (heap buffer, or NULL/0 when
 * there is nothing to emit) and setup_fdt() frees the buffer.
 */
struct fdt_prop {
	void *value;	/* property bytes, or NULL when empty */
	int size;	/* length of @value in bytes */
};
| |
| static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop) |
| { |
| struct kvm_ppc_one_seg_page_size *sps; |
| int i, j, size; |
| u32 *p; |
| |
| for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { |
| sps = &info->sps[i]; |
| |
| if (sps->page_shift == 0) |
| break; |
| |
| /* page shift, slb enc & count */ |
| size += 3; |
| |
| for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { |
| if (info->sps[i].enc[j].page_shift == 0) |
| break; |
| |
| /* page shift & pte enc */ |
| size += 2; |
| } |
| } |
| |
| if (!size) { |
| prop->value = NULL; |
| prop->size = 0; |
| return; |
| } |
| |
| /* Convert size to bytes */ |
| prop->size = size * sizeof(u32); |
| |
| prop->value = malloc(prop->size); |
| if (!prop->value) |
| die_perror("malloc failed"); |
| |
| p = (u32 *)prop->value; |
| for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { |
| sps = &info->sps[i]; |
| |
| if (sps->page_shift == 0) |
| break; |
| |
| *p++ = sps->page_shift; |
| *p++ = sps->slb_enc; |
| |
| for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) |
| if (!info->sps[i].enc[j].page_shift) |
| break; |
| |
| *p++ = j; /* count of enc */ |
| |
| for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { |
| if (!info->sps[i].enc[j].page_shift) |
| break; |
| |
| *p++ = info->sps[i].enc[j].page_shift; |
| *p++ = info->sps[i].enc[j].pte_enc; |
| } |
| } |
| } |
| |
| #define SMT_THREADS 4 |
| |
| /* |
| * Set up the FDT for the kernel: This function is currently fairly SPAPR-heavy, |
| * and whilst most PPC targets will require CPU/memory nodes, others like RTAS |
| * should eventually be added separately. |
| */ |
| static void setup_fdt(struct kvm *kvm) |
| { |
| uint64_t mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) }; |
| int smp_cpus = kvm->nrcpus; |
| uint32_t int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)}; |
| char hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0" |
| "hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0" |
| "hcall-splpar\0hcall-bulk"; |
| int i, j; |
| char cpu_name[30]; |
| u8 staging_fdt[FDT_MAX_SIZE]; |
| struct cpu_info *cpu_info = find_cpu_info(kvm); |
| struct fdt_prop segment_page_sizes; |
| u32 segment_sizes_1T[] = {0x1c, 0x28, 0xffffffff, 0xffffffff}; |
| |
| /* Generate an appropriate DT at kvm->arch.fdt_gra */ |
| void *fdt_dest = guest_flat_to_host(kvm, kvm->arch.fdt_gra); |
| void *fdt = staging_fdt; |
| |
| _FDT(fdt_create(fdt, FDT_MAX_SIZE)); |
| _FDT(fdt_finish_reservemap(fdt)); |
| |
| _FDT(fdt_begin_node(fdt, "")); |
| |
| _FDT(fdt_property_string(fdt, "device_type", "chrp")); |
| _FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)")); |
| _FDT(fdt_property_cell(fdt, "#address-cells", 0x2)); |
| _FDT(fdt_property_cell(fdt, "#size-cells", 0x2)); |
| |
| /* RTAS */ |
| _FDT(fdt_begin_node(fdt, "rtas")); |
| /* This is what the kernel uses to switch 'We're an LPAR'! */ |
| _FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm, |
| sizeof(hypertas_prop_kvm))); |
| _FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->arch.rtas_gra)); |
| _FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->arch.rtas_gra)); |
| _FDT(fdt_property_cell(fdt, "rtas-size", kvm->arch.rtas_size)); |
| /* Now add properties for all RTAS tokens: */ |
| if (spapr_rtas_fdt_setup(kvm, fdt)) |
| die("Couldn't create RTAS FDT properties\n"); |
| |
| _FDT(fdt_end_node(fdt)); |
| |
| /* /chosen */ |
| _FDT(fdt_begin_node(fdt, "chosen")); |
| /* cmdline */ |
| _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline)); |
| /* Initrd */ |
| if (kvm->arch.initrd_size != 0) { |
| uint32_t ird_st_prop = cpu_to_be32(kvm->arch.initrd_gra); |
| uint32_t ird_end_prop = cpu_to_be32(kvm->arch.initrd_gra + |
| kvm->arch.initrd_size); |
| _FDT(fdt_property(fdt, "linux,initrd-start", |
| &ird_st_prop, sizeof(ird_st_prop))); |
| _FDT(fdt_property(fdt, "linux,initrd-end", |
| &ird_end_prop, sizeof(ird_end_prop))); |
| } |
| |
| /* |
| * stdout-path: This is assuming we're using the HV console. Also, the |
| * address is hardwired until we do a VIO bus. |
| */ |
| _FDT(fdt_property_string(fdt, "linux,stdout-path", |
| "/vdevice/vty@30000000")); |
| _FDT(fdt_end_node(fdt)); |
| |
| /* |
| * Memory: We don't alloc. a separate RMA yet. If we ever need to |
| * (CAP_PPC_RMA == 2) then have one memory node for 0->RMAsize, and |
| * another RMAsize->endOfMem. |
| */ |
| _FDT(fdt_begin_node(fdt, "memory@0")); |
| _FDT(fdt_property_string(fdt, "device_type", "memory")); |
| _FDT(fdt_property(fdt, "reg", mem_reg_property, |
| sizeof(mem_reg_property))); |
| _FDT(fdt_end_node(fdt)); |
| |
| generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes); |
| |
| /* CPUs */ |
| _FDT(fdt_begin_node(fdt, "cpus")); |
| _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); |
| _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); |
| |
| for (i = 0; i < smp_cpus; i += SMT_THREADS) { |
| int32_t pft_size_prop[] = { 0, HPT_ORDER }; |
| uint32_t servers_prop[SMT_THREADS]; |
| uint32_t gservers_prop[SMT_THREADS * 2]; |
| int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS : |
| smp_cpus - i; |
| |
| sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i); |
| _FDT(fdt_begin_node(fdt, cpu_name)); |
| sprintf(cpu_name, "PowerPC,%s", cpu_info->name); |
| _FDT(fdt_property_string(fdt, "name", cpu_name)); |
| _FDT(fdt_property_string(fdt, "device_type", "cpu")); |
| |
| _FDT(fdt_property_cell(fdt, "reg", i)); |
| _FDT(fdt_property_cell(fdt, "cpu-version", kvm->arch.pvr)); |
| |
| _FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize)); |
| _FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize)); |
| |
| _FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq)); |
| /* Lies, but safeish lies! */ |
| _FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200)); |
| |
| if (cpu_info->mmu_info.slb_size) |
| _FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size)); |
| |
| /* |
| * HPT size is hardwired; KVM currently fixes it at 16MB but the |
| * moment that changes we'll need to read it out of the kernel. |
| */ |
| _FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop, |
| sizeof(pft_size_prop))); |
| |
| _FDT(fdt_property_string(fdt, "status", "okay")); |
| _FDT(fdt_property(fdt, "64-bit", NULL, 0)); |
| /* A server for each thread in this core */ |
| for (j = 0; j < SMT_THREADS; j++) { |
| servers_prop[j] = cpu_to_be32(i+j); |
| /* |
| * Hack borrowed from QEMU, direct the group queues back |
| * to cpu 0: |
| */ |
| gservers_prop[j*2] = cpu_to_be32(i+j); |
| gservers_prop[j*2 + 1] = 0; |
| } |
| _FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s", |
| servers_prop, threads * sizeof(uint32_t))); |
| _FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s", |
| gservers_prop, |
| threads * 2 * sizeof(uint32_t))); |
| |
| if (segment_page_sizes.value) |
| _FDT(fdt_property(fdt, "ibm,segment-page-sizes", |
| segment_page_sizes.value, |
| segment_page_sizes.size)); |
| |
| if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS) |
| _FDT(fdt_property(fdt, "ibm,processor-segment-sizes", |
| segment_sizes_1T, sizeof(segment_sizes_1T))); |
| |
| /* VSX / DFP options: */ |
| if (cpu_info->flags & CPUINFO_FLAG_VMX) |
| _FDT(fdt_property_cell(fdt, "ibm,vmx", |
| (cpu_info->flags & |
| CPUINFO_FLAG_VSX) ? 2 : 1)); |
| if (cpu_info->flags & CPUINFO_FLAG_DFP) |
| _FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1)); |
| _FDT(fdt_end_node(fdt)); |
| } |
| _FDT(fdt_end_node(fdt)); |
| |
| /* IRQ controller */ |
| _FDT(fdt_begin_node(fdt, "interrupt-controller@0")); |
| |
| _FDT(fdt_property_string(fdt, "device_type", |
| "PowerPC-External-Interrupt-Presentation")); |
| _FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")); |
| _FDT(fdt_property_cell(fdt, "reg", 0)); |
| _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0)); |
| _FDT(fdt_property(fdt, "ibm,interrupt-server-ranges", |
| int_server_ranges_prop, |
| sizeof(int_server_ranges_prop))); |
| _FDT(fdt_property_cell(fdt, "#interrupt-cells", 2)); |
| _FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)); |
| _FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP)); |
| _FDT(fdt_end_node(fdt)); |
| |
| /* |
| * VIO: See comment in linux,stdout-path; we don't yet represent a VIO |
| * bus/address allocation so addresses are hardwired here. |
| */ |
| _FDT(fdt_begin_node(fdt, "vdevice")); |
| _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); |
| _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); |
| _FDT(fdt_property_string(fdt, "device_type", "vdevice")); |
| _FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice")); |
| _FDT(fdt_begin_node(fdt, "vty@30000000")); |
| _FDT(fdt_property_string(fdt, "name", "vty")); |
| _FDT(fdt_property_string(fdt, "device_type", "serial")); |
| _FDT(fdt_property_string(fdt, "compatible", "hvterm1")); |
| _FDT(fdt_property_cell(fdt, "reg", 0x30000000)); |
| _FDT(fdt_end_node(fdt)); |
| _FDT(fdt_end_node(fdt)); |
| |
| /* Finalise: */ |
| _FDT(fdt_end_node(fdt)); /* Root node */ |
| _FDT(fdt_finish(fdt)); |
| |
| _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE)); |
| |
| /* PCI */ |
| if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest)) |
| die("Fail populating PCI device nodes"); |
| |
| _FDT(fdt_add_mem_rsv(fdt_dest, kvm->arch.rtas_gra, kvm->arch.rtas_size)); |
| _FDT(fdt_pack(fdt_dest)); |
| |
| free(segment_page_sizes.value); |
| } |
| |
| /** |
| * kvm__arch_setup_firmware |
| */ |
| int kvm__arch_setup_firmware(struct kvm *kvm) |
| { |
| /* |
| * Set up RTAS stub. All it is is a single hypercall: |
| * 0: 7c 64 1b 78 mr r4,r3 |
| * 4: 3c 60 00 00 lis r3,0 |
| * 8: 60 63 f0 00 ori r3,r3,61440 |
| * c: 44 00 00 22 sc 1 |
| * 10: 4e 80 00 20 blr |
| */ |
| uint32_t *rtas = guest_flat_to_host(kvm, kvm->arch.rtas_gra); |
| |
| rtas[0] = 0x7c641b78; |
| rtas[1] = 0x3c600000; |
| rtas[2] = 0x6063f000; |
| rtas[3] = 0x44000022; |
| rtas[4] = 0x4e800020; |
| kvm->arch.rtas_size = 20; |
| |
| pr_info("Set up %ld bytes of RTAS at 0x%lx\n", |
| kvm->arch.rtas_size, kvm->arch.rtas_gra); |
| |
| /* Load SLOF */ |
| |
| /* Init FDT */ |
| setup_fdt(kvm); |
| |
| return 0; |
| } |
| |
/*
 * Counterpart of kvm__arch_setup_firmware().  Nothing is released here;
 * the function simply reports success (0) and does not touch @kvm.
 */
int kvm__arch_free_firmware(struct kvm *kvm)
{
	return 0;
}