| // Copyright 2017 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| extern crate arch; |
| extern crate byteorder; |
| extern crate data_model; |
| extern crate devices; |
| extern crate kvm; |
| extern crate kvm_sys; |
| extern crate libc; |
| extern crate sys_util; |
| extern crate resources; |
| extern crate kernel_cmdline; |
| extern crate kernel_loader; |
| |
| #[allow(dead_code)] |
| #[allow(non_upper_case_globals)] |
| #[allow(non_camel_case_types)] |
| #[allow(non_snake_case)] |
| mod bootparam; |
| // Bindgen didn't implement copy for boot_params because edid_info contains an array with len > 32. |
| impl Copy for bootparam::edid_info {} |
| impl Clone for bootparam::edid_info { |
| fn clone(&self) -> Self { |
| *self |
| } |
| } |
| impl Copy for bootparam::boot_params {} |
| impl Clone for bootparam::boot_params { |
| fn clone(&self) -> Self { |
| *self |
| } |
| } |
| // boot_params is just a series of ints, it is safe to initialize it. |
| unsafe impl data_model::DataInit for bootparam::boot_params {} |
| |
| #[allow(dead_code)] |
| #[allow(non_upper_case_globals)] |
| mod msr_index; |
| |
| #[allow(dead_code)] |
| #[allow(non_upper_case_globals)] |
| #[allow(non_camel_case_types)] |
| mod mpspec; |
| // These mpspec types are only data, reading them from data is a safe initialization. |
| unsafe impl data_model::DataInit for mpspec::mpc_bus {} |
| unsafe impl data_model::DataInit for mpspec::mpc_cpu {} |
| unsafe impl data_model::DataInit for mpspec::mpc_intsrc {} |
| unsafe impl data_model::DataInit for mpspec::mpc_ioapic {} |
| unsafe impl data_model::DataInit for mpspec::mpc_table {} |
| unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {} |
| unsafe impl data_model::DataInit for mpspec::mpf_intel {} |
| |
| mod cpuid; |
| mod gdt; |
| mod interrupts; |
| mod mptable; |
| mod regs; |
| |
| use std::mem; |
| use std::result; |
| use std::error::{self, Error as X86Error}; |
| use std::fmt::{self, Display}; |
| use std::fs::File; |
| use std::ffi::{CStr, CString}; |
| use std::io::{self, stdout}; |
| use std::sync::{Arc, Mutex}; |
| |
| use arch::{RunnableLinuxVm, VirtioDeviceStub, VmComponents}; |
| use bootparam::boot_params; |
| use bootparam::E820_RAM; |
| use devices::PciInterruptPin; |
| use sys_util::{EventFd, GuestAddress, GuestMemory}; |
| use resources::{AddressRanges, SystemAllocator}; |
| use kvm::*; |
| |
| #[derive(Debug)] |
| pub enum Error { |
| /// Error configuring the system |
| ConfigureSystem, |
| /// Unable to clone an EventFd |
| CloneEventFd(sys_util::Error), |
| /// Error creating kernel command line. |
| Cmdline(kernel_cmdline::Error), |
| /// Unable to make an EventFd |
| CreateEventFd(sys_util::Error), |
| /// Unable to create Kvm. |
| CreateKvm(sys_util::Error), |
| /// Unable to create a PciRoot hub. |
| CreatePciRoot(arch::DeviceRegistrationError), |
| /// Unable to create socket. |
| CreateSocket(io::Error), |
| /// Unable to create Vcpu. |
| CreateVcpu(sys_util::Error), |
| /// The kernel extends past the end of RAM |
| KernelOffsetPastEnd, |
| /// Error registering an IrqFd |
| RegisterIrqfd(sys_util::Error), |
| /// Couldn't register virtio socket. |
| RegisterVsock(arch::DeviceRegistrationError), |
| LoadCmdline(kernel_loader::Error), |
| LoadKernel(kernel_loader::Error), |
| /// Error writing the zero page of guest memory. |
| ZeroPageSetup, |
| /// The zero page extends past the end of guest_mem. |
| ZeroPagePastRamEnd, |
| /// Invalid e820 setup params. |
| E820Configuration, |
| } |
| |
| impl error::Error for Error { |
| fn description(&self) -> &str { |
| match self { |
| &Error::ConfigureSystem => "Error configuring the system", |
| &Error::CloneEventFd(_) => "Unable to clone an EventFd", |
| &Error::Cmdline(_) => "the given kernel command line was invalid", |
| &Error::CreateEventFd(_) => "Unable to make an EventFd", |
| &Error::CreateKvm(_) => "failed to open /dev/kvm", |
| &Error::CreatePciRoot(_) => "failed to create a PCI root hub", |
| &Error::CreateSocket(_) => "failed to create socket", |
| &Error::CreateVcpu(_) => "failed to create VCPU", |
| &Error::KernelOffsetPastEnd => |
| "The kernel extends past the end of RAM", |
| &Error::RegisterIrqfd(_) => "Error registering an IrqFd", |
| &Error::RegisterVsock(_) => "error registering virtual socket device", |
| &Error::LoadCmdline(_) => "Error Loading command line", |
| &Error::LoadKernel(_) => "Error Loading Kernel", |
| &Error::ZeroPageSetup => |
| "Error writing the zero page of guest memory", |
| &Error::ZeroPagePastRamEnd => |
| "The zero page extends past the end of guest_mem", |
| &Error::E820Configuration => "Invalid e820 setup params", |
| } |
| } |
| } |
| |
| impl Display for Error { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "X86 Arch Error: {}", Error::description(self)) |
| } |
| } |
| |
| pub struct X8664arch; |
| pub type Result<T> = result::Result<T, Box<std::error::Error>>; |
| |
// Stack pointer value handed to the register setup of every vcpu.
const BOOT_STACK_POINTER: u64 = 0x8000;
// Size of the hole left unmapped just below the 4 GiB boundary (768 MiB).
const MEM_32BIT_GAP_SIZE: u64 = 768 << 20;
// First guest physical address that no longer fits in 32 bits (4 GiB).
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
// Offset added to the kernel load address to obtain the address given to the
// vcpu register setup.
const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
// Guest physical address at which the boot_params ("zero page") are written.
const ZERO_PAGE_OFFSET: u64 = 0x7000;

// Guest physical address at which the kernel image is loaded.
const KERNEL_START_OFFSET: u64 = 0x200000;
// Guest physical address at which the kernel command line is loaded.
const CMDLINE_OFFSET: u64 = 0x20000;
// The command line may use all the space between its start and the kernel.
const CMDLINE_MAX_SIZE: u64 = KERNEL_START_OFFSET - CMDLINE_OFFSET;
// IRQ base passed to the system resource allocator.
const X86_64_IRQ_BASE: u32 = 5;
| |
| fn configure_system(guest_mem: &GuestMemory, |
| kernel_addr: GuestAddress, |
| cmdline_addr: GuestAddress, |
| cmdline_size: usize, |
| num_cpus: u8, |
| pci_irqs: Vec<(u32, PciInterruptPin)>) |
| -> Result<()> { |
| const EBDA_START: u64 = 0x0009fc00; |
| const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; |
| const KERNEL_HDR_MAGIC: u32 = 0x53726448; |
| const KERNEL_LOADER_OTHER: u8 = 0xff; |
| const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x1000000; // Must be non-zero. |
| let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); |
| let end_32bit_gap_start = GuestAddress(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE); |
| |
| // Note that this puts the mptable at 0x0 in guest physical memory. |
| mptable::setup_mptable(guest_mem, num_cpus, pci_irqs)?; |
| |
| let mut params: boot_params = Default::default(); |
| |
| params.hdr.type_of_loader = KERNEL_LOADER_OTHER; |
| params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC; |
| params.hdr.header = KERNEL_HDR_MAGIC; |
| params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32; |
| params.hdr.cmdline_size = cmdline_size as u32; |
| params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES; |
| |
| add_e820_entry(&mut params, 0, EBDA_START, E820_RAM)?; |
| |
| let mem_end = guest_mem.end_addr(); |
| if mem_end < end_32bit_gap_start { |
| add_e820_entry(&mut params, |
| kernel_addr.offset() as u64, |
| mem_end.offset_from(kernel_addr) as u64, |
| E820_RAM)?; |
| } else { |
| add_e820_entry(&mut params, |
| kernel_addr.offset() as u64, |
| end_32bit_gap_start.offset_from(kernel_addr) as u64, |
| E820_RAM)?; |
| if mem_end > first_addr_past_32bits { |
| add_e820_entry(&mut params, |
| first_addr_past_32bits.offset() as u64, |
| mem_end.offset_from(first_addr_past_32bits) as u64, |
| E820_RAM)?; |
| } |
| } |
| |
| let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET); |
| guest_mem.checked_offset(zero_page_addr, mem::size_of::<boot_params>() as u64) |
| .ok_or(Error::ZeroPagePastRamEnd)?; |
| guest_mem.write_obj_at_addr(params, zero_page_addr) |
| .map_err(|_| Error::ZeroPageSetup)?; |
| |
| Ok(()) |
| } |
| |
| /// Add an e820 region to the e820 map. |
| /// Returns Ok(()) if successful, or an error if there is no space left in the map. |
| fn add_e820_entry(params: &mut boot_params, addr: u64, size: u64, mem_type: u32) -> Result<()> { |
| if params.e820_entries >= params.e820_map.len() as u8 { |
| return Err(Box::new(Error::E820Configuration)); |
| } |
| |
| params.e820_map[params.e820_entries as usize].addr = addr; |
| params.e820_map[params.e820_entries as usize].size = size; |
| params.e820_map[params.e820_entries as usize].type_ = mem_type; |
| params.e820_entries += 1; |
| |
| Ok(()) |
| } |
| |
| /// Returns a Vec of the valid memory addresses. |
| /// These should be used to configure the GuestMemory structure for the platfrom. |
| /// For x86_64 all addresses are valid from the start of the kenel except a |
| /// carve out at the end of 32bit address space. |
| fn arch_memory_regions(size: u64) -> Vec<(GuestAddress, u64)> { |
| let mem_end = GuestAddress(size); |
| let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); |
| let end_32bit_gap_start = GuestAddress(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE); |
| |
| let mut regions = Vec::new(); |
| if mem_end < end_32bit_gap_start { |
| regions.push((GuestAddress(0), size)); |
| } else { |
| regions.push((GuestAddress(0), end_32bit_gap_start.offset())); |
| if mem_end > first_addr_past_32bits { |
| regions.push((first_addr_past_32bits, mem_end.offset_from(first_addr_past_32bits))); |
| } |
| } |
| |
| regions |
| } |
| |
| impl arch::LinuxArch for X8664arch { |
| fn build_vm<F>(mut components: VmComponents, virtio_devs: F) -> Result<RunnableLinuxVm> |
| where |
| F: FnOnce(&GuestMemory, &EventFd) -> Result<Vec<VirtioDeviceStub>> |
| { |
| let mut resources = Self::get_resource_allocator(components.memory_mb, |
| components.wayland_dmabuf); |
| let mem = Self::setup_memory(components.memory_mb)?; |
| let kvm = Kvm::new().map_err(Error::CreateKvm)?; |
| let mut vm = Self::create_vm(&kvm, mem.clone())?; |
| |
| let vcpu_count = components.vcpu_count; |
| let mut vcpus = Vec::with_capacity(vcpu_count as usize); |
| for cpu_id in 0..vcpu_count { |
| let vcpu = Vcpu::new(cpu_id as libc::c_ulong, &kvm, &vm) |
| .map_err(Error::CreateVcpu)?; |
| Self::configure_vcpu(vm.get_memory(), &kvm, &vm, &vcpu, |
| cpu_id as u64, vcpu_count as u64)?; |
| vcpus.push(vcpu); |
| } |
| |
| let irq_chip = Self::create_irq_chip(&vm)?; |
| let mut cmdline = Self::get_base_linux_cmdline(); |
| |
| let mut mmio_bus = devices::Bus::new(); |
| |
| let (pci, pci_irqs) = arch::generate_pci_root(components.pci_devices, |
| &mut mmio_bus, |
| &mut resources) |
| .map_err(Error::CreatePciRoot)?; |
| let pci_bus = Arc::new(Mutex::new(pci)); |
| |
| let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?; |
| let (io_bus, stdio_serial) = Self::setup_io_bus( |
| &mut vm, |
| exit_evt.try_clone().map_err(Error::CloneEventFd)?, |
| Some(pci_bus.clone()))?; |
| |
| |
| // Create a list of mmio devices to be added. |
| let mmio_devs = virtio_devs(&mem, &exit_evt)?; |
| |
| for stub in mmio_devs { |
| arch::register_mmio(&mut mmio_bus, &mut vm, stub.dev, stub.jail, |
| &mut resources, &mut cmdline) |
| .map_err(Error::RegisterVsock)?; |
| } |
| |
| for param in components.extra_kernel_params { |
| cmdline.insert_str(¶m).map_err(Error::Cmdline)?; |
| } |
| |
| // separate out load_kernel from other setup to get a specific error for |
| // kernel loading |
| Self::load_kernel(&mem, &mut components.kernel_image)?; |
| Self::setup_system_memory(&mem, components.memory_mb, vcpu_count, |
| &CString::new(cmdline).unwrap(), pci_irqs)?; |
| |
| Ok(RunnableLinuxVm { |
| vm, |
| kvm, |
| resources, |
| stdio_serial, |
| exit_evt, |
| vcpus, |
| irq_chip, |
| io_bus, |
| mmio_bus, |
| }) |
| } |
| } |
| |
| impl X8664arch { |
| /// Loads the kernel from an open file. |
| /// |
| /// # Arguments |
| /// |
| /// * `mem` - The memory to be used by the guest. |
| /// * `kernel_image` - the File object for the specified kernel. |
| fn load_kernel(mem: &GuestMemory, mut kernel_image: &mut File) -> Result<()> { |
| kernel_loader::load_kernel(mem, GuestAddress(KERNEL_START_OFFSET), |
| &mut kernel_image)?; |
| Ok(()) |
| } |
| |
| /// Configures the system memory space should be called once per vm before |
| /// starting vcpu threads. |
| /// |
| /// # Arguments |
| /// |
| /// * `mem` - The memory to be used by the guest. |
| /// * `vcpu_count` - Number of virtual CPUs the guest will have. |
| /// * `cmdline` - the kernel commandline |
| fn setup_system_memory(mem: &GuestMemory, _mem_size: u64, |
| vcpu_count: u32, cmdline: &CStr, |
| pci_irqs: Vec<(u32, PciInterruptPin)>) -> Result<()> { |
| kernel_loader::load_cmdline(mem, GuestAddress(CMDLINE_OFFSET), cmdline)?; |
| configure_system(mem, GuestAddress(KERNEL_START_OFFSET), |
| GuestAddress(CMDLINE_OFFSET), |
| cmdline.to_bytes().len() + 1, vcpu_count as u8, pci_irqs)?; |
| Ok(()) |
| } |
| |
| /// Creates a new VM object and initializes architecture specific devices |
| /// |
| /// # Arguments |
| /// |
| /// * `kvm` - The opened /dev/kvm object. |
| /// * `mem` - The memory to be used by the guest. |
| fn create_vm(kvm: &Kvm, mem: GuestMemory) -> Result<Vm> { |
| let vm = Vm::new(&kvm, mem)?; |
| let tss_addr = GuestAddress(0xfffbd000); |
| vm.set_tss_addr(tss_addr).expect("set tss addr failed"); |
| vm.create_pit().expect("create pit failed"); |
| vm.create_irq_chip()?; |
| Ok(vm) |
| } |
| |
| /// This creates a GuestMemory object for this VM |
| /// |
| /// * `mem_size` - Desired physical memory size in bytes for this VM |
| fn setup_memory(mem_size: u64) -> Result<sys_util::GuestMemory> { |
| let arch_mem_regions = arch_memory_regions(mem_size); |
| let mem = GuestMemory::new(&arch_mem_regions)?; |
| Ok(mem) |
| } |
| |
| /// The creates the interrupt controller device and optionally returns the fd for it. |
| /// Some architectures may not have a separate descriptor for the interrupt |
| /// controller, so they would return None even on success. |
| /// |
| /// # Arguments |
| /// |
| /// * `vm` - the vm object |
| fn create_irq_chip(_vm: &kvm::Vm) -> Result<Option<File>> { |
| // Unfortunately X86 and ARM have to do this in completely different order |
| // X86 needs to create the irq chip before creating cpus and |
| // ARM needs to do it afterwards. |
| Ok(None) |
| } |
| |
| /// This returns the first page frame number for use by the balloon driver. |
| /// |
| /// # Arguments |
| /// |
| /// * `mem_size` - the size in bytes of physical ram for the guest |
| fn get_base_dev_pfn(mem_size: u64) -> u64 { |
| // Put device memory at a 2MB boundary after physical memory or 4gb, whichever is greater. |
| const MB: u64 = 1024 * 1024; |
| const GB: u64 = 1024 * MB; |
| let mem_size_round_2mb = (mem_size + 2*MB - 1) / (2*MB) * (2*MB); |
| std::cmp::max(mem_size_round_2mb, 4 * GB) / sys_util::pagesize() as u64 |
| } |
| |
| /// This returns a minimal kernel command for this architecture |
| fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline { |
| let mut cmdline = kernel_cmdline::Cmdline::new(CMDLINE_MAX_SIZE as usize); |
| cmdline.insert_str("console=ttyS0 noacpi reboot=k panic=1"). |
| unwrap(); |
| cmdline |
| } |
| |
| /// Returns a system resource allocator. |
| fn get_resource_allocator(mem_size: u64, gpu_allocation: bool) -> SystemAllocator { |
| const MMIO_BASE: u64 = 0xe0000000; |
| let device_addr_start = Self::get_base_dev_pfn(mem_size) * sys_util::pagesize() as u64; |
| AddressRanges::new() |
| .add_io_addresses(0xc000, 0x10000) |
| .add_mmio_addresses(MMIO_BASE, 0x10000) |
| .add_device_addresses(device_addr_start, u64::max_value() - device_addr_start) |
| .create_allocator(X86_64_IRQ_BASE, gpu_allocation).unwrap() |
| } |
| |
| /// Sets up the IO bus for this platform |
| /// |
| /// # Arguments |
| /// |
| /// * - `vm` the vm object |
| /// * - `exit_evt` - the event fd object which should receive exit events |
| fn setup_io_bus(vm: &mut Vm, exit_evt: EventFd, pci: Option<Arc<Mutex<devices::PciRoot>>>) |
| -> Result<(devices::Bus, Arc<Mutex<devices::Serial>>)> { |
| struct NoDevice; |
| impl devices::BusDevice for NoDevice {} |
| |
| let mut io_bus = devices::Bus::new(); |
| |
| let com_evt_1_3 = EventFd::new().map_err(|e| Error::CreateEventFd(e))?; |
| let com_evt_2_4 = EventFd::new().map_err(|e| Error::CreateEventFd(e))?; |
| let stdio_serial = |
| Arc::new(Mutex::new( |
| devices::Serial::new_out(com_evt_1_3.try_clone(). |
| map_err(|e| Error::CloneEventFd(e))?, |
| Box::new(stdout())))); |
| let nul_device = Arc::new(Mutex::new(NoDevice)); |
| io_bus.insert(stdio_serial.clone(), 0x3f8, 0x8, false).unwrap(); |
| io_bus.insert(Arc::new(Mutex::new( |
| devices::Serial::new_sink(com_evt_2_4.try_clone(). |
| map_err(|e| Error::CloneEventFd(e))?))), |
| 0x2f8, |
| 0x8, |
| false) |
| .unwrap(); |
| io_bus.insert(Arc::new(Mutex::new( |
| devices::Serial::new_sink(com_evt_1_3.try_clone(). |
| map_err(|e| Error::CloneEventFd(e))?))), |
| 0x3e8, |
| 0x8, |
| false) |
| .unwrap(); |
| io_bus.insert(Arc::new(Mutex::new( |
| devices::Serial::new_sink(com_evt_2_4.try_clone(). |
| map_err(|e| Error::CloneEventFd(e))?))), |
| 0x2e8, |
| 0x8, |
| false) |
| .unwrap(); |
| io_bus.insert(Arc::new(Mutex::new(devices::Cmos::new())), 0x70, 0x2, false) |
| .unwrap(); |
| io_bus.insert(Arc::new(Mutex::new( |
| devices::I8042Device::new(exit_evt.try_clone(). |
| map_err(|e| Error::CloneEventFd(e))?))), |
| 0x061, |
| 0x4, |
| false) |
| .unwrap(); |
| io_bus.insert(nul_device.clone(), 0x040, 0x8, false).unwrap(); // ignore pit |
| io_bus.insert(nul_device.clone(), 0x0ed, 0x1, false).unwrap(); // most likely this one does nothing |
| io_bus.insert(nul_device.clone(), 0x0f0, 0x2, false).unwrap(); // ignore fpu |
| |
| if let Some(pci_root) = pci { |
| io_bus.insert(pci_root, 0xcf8, 0x8, false).unwrap(); |
| } else { |
| // ignore pci. |
| io_bus.insert(nul_device.clone(), 0xcf8, 0x8, false).unwrap(); |
| } |
| |
| vm.register_irqfd(&com_evt_1_3, 4).map_err(Error::RegisterIrqfd)?; |
| vm.register_irqfd(&com_evt_2_4, 3).map_err(Error::RegisterIrqfd)?; |
| |
| Ok((io_bus, stdio_serial)) |
| } |
| |
| /// Configures the vcpu and should be called once per vcpu from the vcpu's thread. |
| /// |
| /// # Arguments |
| /// |
| /// * `guest_mem` - The memory to be used by the guest. |
| /// * `kernel_load_offset` - Offset in bytes from `guest_mem` at which the |
| /// kernel starts. |
| /// * `kvm` - The /dev/kvm object that created vcpu. |
| /// * `vm` - The VM object associated with this VCPU. |
| /// * `vcpu` - The VCPU object to configure. |
| /// * `cpu_id` - The id of the given `vcpu`. |
| /// * `num_cpus` - Number of virtual CPUs the guest will have. |
| fn configure_vcpu(guest_mem: &GuestMemory, |
| kvm: &Kvm, |
| _vm: &Vm, |
| vcpu: &Vcpu, |
| cpu_id: u64, |
| num_cpus: u64) |
| -> Result<()> { |
| let kernel_load_addr = GuestAddress(KERNEL_START_OFFSET); |
| cpuid::setup_cpuid(kvm, vcpu, cpu_id, num_cpus)?; |
| regs::setup_msrs(vcpu)?; |
| let kernel_end = guest_mem.checked_offset(kernel_load_addr, KERNEL_64BIT_ENTRY_OFFSET) |
| .ok_or(Error::KernelOffsetPastEnd)?; |
| regs::setup_regs(vcpu, |
| (kernel_end).offset() as u64, |
| BOOT_STACK_POINTER as u64, |
| ZERO_PAGE_OFFSET as u64)?; |
| regs::setup_fpu(vcpu)?; |
| regs::setup_sregs(guest_mem, vcpu)?; |
| interrupts::set_lint(vcpu)?; |
| Ok(()) |
| } |
| |
| } |
#[cfg(test)]
mod tests {
    use super::*;

    /// A memory size below the 32-bit gap yields a single region starting at 0.
    #[test]
    fn regions_lt_4gb() {
        let size = 1u64 << 29;
        let regions = arch_memory_regions(size);
        assert_eq!(1, regions.len());

        let (start, len) = regions[0];
        assert_eq!(GuestAddress(0), start);
        assert_eq!(size, len);
    }

    /// A memory size past 4 GiB is split into a region at 0 and one at 4 GiB.
    #[test]
    fn regions_gt_4gb() {
        let four_gb = 1u64 << 32;
        let regions = arch_memory_regions(four_gb + 0x8000);
        assert_eq!(2, regions.len());

        let starts: Vec<GuestAddress> = regions.iter().map(|region| region.0).collect();
        assert_eq!(GuestAddress(0), starts[0]);
        assert_eq!(GuestAddress(four_gb), starts[1]);
    }
}