| // Copyright 2017 The ChromiumOS Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! x86 architecture support. |
| |
| #![cfg(target_arch = "x86_64")] |
| |
| mod fdt; |
| |
| #[cfg(feature = "gdb")] |
| mod gdb; |
| |
| const SETUP_DTB: u32 = 2; |
| const SETUP_RNG_SEED: u32 = 9; |
| |
| #[allow(dead_code)] |
| #[allow(non_upper_case_globals)] |
| #[allow(non_camel_case_types)] |
| #[allow(non_snake_case)] |
| pub mod bootparam; |
| |
| #[allow(dead_code)] |
| #[allow(non_upper_case_globals)] |
| mod msr_index; |
| |
| #[allow(dead_code)] |
| #[allow(non_upper_case_globals)] |
| #[allow(non_camel_case_types)] |
| #[allow(clippy::all)] |
| mod mpspec; |
| |
| pub mod multiboot_spec; |
| |
| pub mod acpi; |
| mod bzimage; |
| pub mod cpuid; |
| mod gdt; |
| pub mod interrupts; |
| pub mod mptable; |
| pub mod regs; |
| pub mod smbios; |
| |
| use std::arch::x86_64::CpuidResult; |
| use std::cmp::min; |
| use std::collections::BTreeMap; |
| use std::fmt; |
| use std::fs::File; |
| use std::io; |
| use std::io::Write; |
| use std::mem; |
| use std::path::PathBuf; |
| use std::sync::mpsc; |
| use std::sync::Arc; |
| |
| use acpi_tables::aml; |
| use acpi_tables::aml::Aml; |
| use acpi_tables::sdt::SDT; |
| use anyhow::Context; |
| use arch::get_serial_cmdline; |
| use arch::serial::SerialDeviceInfo; |
| use arch::CpuSet; |
| use arch::DtbOverlay; |
| use arch::FdtPosition; |
| use arch::GetSerialCmdlineError; |
| use arch::MemoryRegionConfig; |
| use arch::PciConfig; |
| use arch::RunnableLinuxVm; |
| use arch::VmComponents; |
| use arch::VmImage; |
| use base::debug; |
| use base::info; |
| use base::warn; |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| use base::AsRawDescriptors; |
| use base::Event; |
| use base::FileGetLen; |
| use base::FileReadWriteAtVolatile; |
| use base::SendTube; |
| use base::Tube; |
| use base::TubeError; |
| use chrono::Utc; |
| pub use cpuid::adjust_cpuid; |
| pub use cpuid::CpuIdContext; |
| use devices::acpi::PM_WAKEUP_GPIO; |
| use devices::Bus; |
| use devices::BusDevice; |
| use devices::BusDeviceObj; |
| use devices::BusResumeDevice; |
| use devices::BusType; |
| use devices::Debugcon; |
| use devices::FwCfgParameters; |
| use devices::IrqChip; |
| use devices::IrqChipX86_64; |
| use devices::IrqEventSource; |
| use devices::PciAddress; |
| use devices::PciConfigIo; |
| use devices::PciConfigMmio; |
| use devices::PciDevice; |
| use devices::PciInterruptPin; |
| use devices::PciRoot; |
| use devices::PciRootCommand; |
| use devices::PciVirtualConfigMmio; |
| use devices::Pflash; |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| use devices::ProxyDevice; |
| use devices::Serial; |
| use devices::SerialHardware; |
| use devices::SerialParameters; |
| use devices::VirtualPmc; |
| use devices::FW_CFG_BASE_PORT; |
| use devices::FW_CFG_MAX_FILE_SLOTS; |
| use devices::FW_CFG_WIDTH; |
| use hypervisor::CpuConfigX86_64; |
| use hypervisor::Hypervisor; |
| use hypervisor::HypervisorX86_64; |
| use hypervisor::ProtectionType; |
| use hypervisor::VcpuInitX86_64; |
| use hypervisor::VcpuX86_64; |
| use hypervisor::Vm; |
| use hypervisor::VmCap; |
| use hypervisor::VmX86_64; |
| #[cfg(feature = "seccomp_trace")] |
| use jail::read_jail_addr; |
| #[cfg(windows)] |
| use jail::FakeMinijailStub as Minijail; |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| use minijail::Minijail; |
| use mptable::MPTABLE_RANGE; |
| use multiboot_spec::MultibootInfo; |
| use multiboot_spec::MultibootMmapEntry; |
| use multiboot_spec::MULTIBOOT_BOOTLOADER_MAGIC; |
| use remain::sorted; |
| use resources::AddressRange; |
| use resources::SystemAllocator; |
| use resources::SystemAllocatorConfig; |
| use sync::Condvar; |
| use sync::Mutex; |
| use thiserror::Error; |
| use vm_control::BatControl; |
| use vm_control::BatteryType; |
| use vm_memory::GuestAddress; |
| use vm_memory::GuestMemory; |
| use vm_memory::GuestMemoryError; |
| use vm_memory::MemoryRegionOptions; |
| use vm_memory::MemoryRegionPurpose; |
| use zerocopy::FromBytes; |
| use zerocopy::Immutable; |
| use zerocopy::IntoBytes; |
| use zerocopy::KnownLayout; |
| |
| use crate::bootparam::boot_params; |
| use crate::bootparam::setup_header; |
| use crate::bootparam::XLF_CAN_BE_LOADED_ABOVE_4G; |
| use crate::cpuid::EDX_HYBRID_CPU_SHIFT; |
| |
| #[sorted] |
| #[derive(Error, Debug)] |
| pub enum Error { |
| #[error("error allocating a single gpe")] |
| AllocateGpe, |
| #[error("error allocating IO resource: {0}")] |
| AllocateIOResouce(resources::Error), |
| #[error("error allocating a single irq")] |
| AllocateIrq, |
| #[error("unable to clone an Event: {0}")] |
| CloneEvent(base::Error), |
| #[error("failed to clone IRQ chip: {0}")] |
| CloneIrqChip(base::Error), |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| #[error("failed to clone jail: {0}")] |
| CloneJail(minijail::Error), |
| #[error("unable to clone a Tube: {0}")] |
| CloneTube(TubeError), |
| #[error("the given kernel command line was invalid: {0}")] |
| Cmdline(kernel_cmdline::Error), |
| #[error("failed writing command line to guest memory")] |
| CommandLineCopy, |
| #[error("command line overflowed guest memory")] |
| CommandLineOverflow, |
| #[error("failed to configure hotplugged pci device: {0}")] |
| ConfigurePciDevice(arch::DeviceRegistrationError), |
| #[error("bad PCI ECAM configuration: {0}")] |
| ConfigurePciEcam(String), |
| #[error("bad PCI mem configuration: {0}")] |
| ConfigurePciMem(String), |
| #[error("failed to configure segment registers: {0}")] |
| ConfigureSegments(regs::Error), |
| #[error("error configuring the system")] |
| ConfigureSystem, |
| #[error("unable to create ACPI tables")] |
| CreateAcpi, |
| #[error("unable to create battery devices: {0}")] |
| CreateBatDevices(arch::DeviceRegistrationError), |
| #[error("could not create debugcon device: {0}")] |
| CreateDebugconDevice(devices::SerialError), |
| #[error("unable to make an Event: {0}")] |
| CreateEvent(base::Error), |
| #[error("failed to create fdt: {0}")] |
| CreateFdt(cros_fdt::Error), |
| #[error("failed to create fw_cfg device: {0}")] |
| CreateFwCfgDevice(devices::FwCfgError), |
| #[error("failed to create IOAPIC device: {0}")] |
| CreateIoapicDevice(base::Error), |
| #[error("failed to create a PCI root hub: {0}")] |
| CreatePciRoot(arch::DeviceRegistrationError), |
| #[error("unable to create PIT: {0}")] |
| CreatePit(base::Error), |
| #[error("unable to make PIT device: {0}")] |
| CreatePitDevice(devices::PitError), |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| #[error("unable to create proxy device: {0}")] |
| CreateProxyDevice(devices::ProxyError), |
| #[error("unable to create serial devices: {0}")] |
| CreateSerialDevices(arch::DeviceRegistrationError), |
| #[error("failed to create socket: {0}")] |
| CreateSocket(io::Error), |
| #[error("failed to create tube: {0}")] |
| CreateTube(base::TubeError), |
| #[error("failed to create VCPU: {0}")] |
| CreateVcpu(base::Error), |
| #[error("DTB size is larger than the allowed size")] |
| DTBSizeGreaterThanAllowed, |
| #[error("invalid e820 setup params")] |
| E820Configuration, |
| #[error("failed to enable singlestep execution: {0}")] |
| EnableSinglestep(base::Error), |
| #[error("failed to enable split irqchip: {0}")] |
| EnableSplitIrqchip(base::Error), |
| #[error("failed to get serial cmdline: {0}")] |
| GetSerialCmdline(GetSerialCmdlineError), |
| #[error("failed to insert device onto bus: {0}")] |
| InsertBus(devices::BusError), |
| #[error("the kernel extends past the end of RAM")] |
| InvalidCpuConfig, |
| #[error("invalid CPU config parameters")] |
| KernelOffsetPastEnd, |
| #[error("error loading bios: {0}")] |
| LoadBios(io::Error), |
| #[error("error loading kernel bzImage: {0}")] |
| LoadBzImage(bzimage::Error), |
| #[error("error loading custom pVM firmware: {0}")] |
| LoadCustomPvmFw(arch::LoadImageError), |
| #[error("error loading initrd: {0}")] |
| LoadInitrd(arch::LoadImageError), |
| #[error("error loading Kernel: {0}")] |
| LoadKernel(kernel_loader::Error), |
| #[error("error loading pflash: {0}")] |
| LoadPflash(io::Error), |
| #[error("error loading pVM firmware: {0}")] |
| LoadPvmFw(base::Error), |
| #[error("error in multiboot_info setup")] |
| MultibootInfoSetup, |
| #[error("error translating address: Page not present")] |
| PageNotPresent, |
| #[error("pci mmio overlaps with pVM firmware memory")] |
| PciMmioOverlapPvmFw, |
| #[error("pVM firmware not supported when bios is used on x86_64")] |
| PvmFwBiosUnsupported, |
| #[error("error reading guest memory {0}")] |
| ReadingGuestMemory(vm_memory::GuestMemoryError), |
| #[error("single register read not supported on x86_64")] |
| ReadRegIsUnsupported, |
| #[error("error reading CPU registers {0}")] |
| ReadRegs(base::Error), |
| #[error("error registering an IrqFd: {0}")] |
| RegisterIrqfd(base::Error), |
| #[error("error registering virtual socket device: {0}")] |
| RegisterVsock(arch::DeviceRegistrationError), |
| #[error("error reserved pcie config mmio")] |
| ReservePcieCfgMmio(resources::Error), |
| #[error("failed to set a hardware breakpoint: {0}")] |
| SetHwBreakpoint(base::Error), |
| #[error("failed to set identity map addr: {0}")] |
| SetIdentityMapAddr(base::Error), |
| #[error("failed to set interrupts: {0}")] |
| SetLint(interrupts::Error), |
| #[error("failed to set tss addr: {0}")] |
| SetTssAddr(base::Error), |
| #[error("failed to set up cmos: {0}")] |
| SetupCmos(anyhow::Error), |
| #[error("failed to set up cpuid: {0}")] |
| SetupCpuid(cpuid::Error), |
| #[error("setup data too large")] |
| SetupDataTooLarge, |
| #[error("failed to set up FPU: {0}")] |
| SetupFpu(base::Error), |
| #[error("failed to set up guest memory: {0}")] |
| SetupGuestMemory(GuestMemoryError), |
| #[error("failed to set up mptable: {0}")] |
| SetupMptable(mptable::Error), |
| #[error("failed to set up MSRs: {0}")] |
| SetupMsrs(base::Error), |
| #[error("failed to set up page tables: {0}")] |
| SetupPageTables(regs::Error), |
| #[error("failed to set up pflash: {0}")] |
| SetupPflash(anyhow::Error), |
| #[error("failed to set up registers: {0}")] |
| SetupRegs(regs::Error), |
| #[error("failed to set up SMBIOS: {0}")] |
| SetupSmbios(smbios::Error), |
| #[error("failed to set up sregs: {0}")] |
| SetupSregs(base::Error), |
| #[error("too many vCPUs")] |
| TooManyVcpus, |
| #[error("failed to translate virtual address")] |
| TranslatingVirtAddr, |
| #[error("protected VMs not supported on x86_64")] |
| UnsupportedProtectionType, |
| #[error("single register write not supported on x86_64")] |
| WriteRegIsUnsupported, |
| #[error("error writing CPU registers {0}")] |
| WriteRegs(base::Error), |
| #[error("error writing guest memory {0}")] |
| WritingGuestMemory(GuestMemoryError), |
| #[error("error writing setup_data: {0}")] |
| WritingSetupData(GuestMemoryError), |
| #[error("the zero page extends past the end of guest_mem")] |
| ZeroPagePastRamEnd, |
| #[error("error writing the zero page of guest memory")] |
| ZeroPageSetup, |
| } |
| |
| pub type Result<T> = std::result::Result<T, Error>; |
| |
| pub struct X8664arch; |
| |
| // Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to |
| // safely implement Copy, Clone |
| #[repr(C)] |
| #[derive(Copy, Clone, Default, FromBytes, Immutable, IntoBytes, KnownLayout)] |
| struct setup_data_hdr { |
| pub next: u64, |
| pub type_: u32, |
| pub len: u32, |
| } |
| |
| #[repr(u32)] |
| #[derive(Copy, Clone, Debug, PartialEq, Eq)] |
| pub enum SetupDataType { |
| Dtb = SETUP_DTB, |
| RngSeed = SETUP_RNG_SEED, |
| } |
| |
| /// A single entry to be inserted in the bootparam `setup_data` linked list. |
| pub struct SetupData { |
| pub data: Vec<u8>, |
| pub type_: SetupDataType, |
| } |
| |
| impl SetupData { |
| /// Returns the length of the data |
| pub fn size(&self) -> usize { |
| self.data.len() |
| } |
| } |
| |
| /// Collection of SetupData entries to be inserted in the |
| /// bootparam `setup_data` linked list. |
| pub struct SetupDataEntries { |
| entries: Vec<SetupData>, |
| setup_data_start: usize, |
| setup_data_end: usize, |
| available_size: usize, |
| } |
| |
| impl SetupDataEntries { |
| /// Returns a new instance of SetupDataEntries |
| pub fn new(setup_data_start: usize, setup_data_end: usize) -> SetupDataEntries { |
| SetupDataEntries { |
| entries: Vec::new(), |
| setup_data_start, |
| setup_data_end, |
| available_size: setup_data_end - setup_data_start, |
| } |
| } |
| |
| /// Adds a new SetupDataEntry and returns the remaining size available |
| pub fn insert(&mut self, setup_data: SetupData) -> usize { |
| self.available_size -= setup_data.size(); |
| self.entries.push(setup_data); |
| |
| self.available_size |
| } |
| |
| /// Copy setup_data entries to guest memory and link them together with the `next` field. |
| /// Returns the guest address of the first entry in the setup_data list, if any. |
| pub fn write_setup_data(&self, guest_mem: &GuestMemory) -> Result<Option<GuestAddress>> { |
| write_setup_data( |
| guest_mem, |
| GuestAddress(self.setup_data_start as u64), |
| GuestAddress(self.setup_data_end as u64), |
| &self.entries, |
| ) |
| } |
| } |
| |
| #[derive(Copy, Clone, Debug)] |
| enum E820Type { |
| Ram = 0x01, |
| Reserved = 0x2, |
| } |
| |
| #[derive(Copy, Clone, Debug)] |
| struct E820Entry { |
| pub address: GuestAddress, |
| pub len: u64, |
| pub mem_type: E820Type, |
| } |
| |
| const KB: u64 = 1 << 10; |
| const MB: u64 = 1 << 20; |
| const GB: u64 = 1 << 30; |
| |
| pub const BOOT_STACK_POINTER: u64 = 0x8000; |
| const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; |
| // Make sure it align to 256MB for MTRR convenient |
| const MEM_32BIT_GAP_SIZE: u64 = 768 * MB; |
| // Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/..... |
| const RESERVED_MEM_SIZE: u64 = 0x800_0000; |
| const DEFAULT_PCI_MEM_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1; |
| // Reserve 64MB for pcie enhanced configuration |
| const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000; |
| const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1; |
| const DEFAULT_PCIE_CFG_MMIO_START: u64 = DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1; |
| // Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB). |
| const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1; |
| pub const KERNEL_32BIT_ENTRY_OFFSET: u64 = 0x0; |
| pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200; |
| pub const MULTIBOOT_INFO_OFFSET: u64 = 0x6000; |
| pub const MULTIBOOT_INFO_SIZE: u64 = 0x1000; |
| pub const ZERO_PAGE_OFFSET: u64 = 0x7000; |
| // Set BIOS max size to 16M: this is used only when `unrestricted guest` is disabled |
| const BIOS_MAX_SIZE: u64 = 0x1000000; |
| |
| pub const KERNEL_START_OFFSET: u64 = 0x20_0000; |
| const CMDLINE_OFFSET: u64 = 0x2_0000; |
| const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero |
| const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE; |
| const SETUP_DATA_END: u64 = MPTABLE_RANGE.start; |
| const X86_64_FDT_MAX_SIZE: u64 = 0x4000; |
| const X86_64_SERIAL_1_3_IRQ: u32 = 4; |
| const X86_64_SERIAL_2_4_IRQ: u32 = 3; |
| // X86_64_SCI_IRQ is used to fill the ACPI FACP table. |
| // The sci_irq number is better to be a legacy |
| // IRQ number which is less than 16(actually most of the |
| // platforms have fixed IRQ number 9). So we can |
| // reserve the IRQ number 5 for SCI and let the |
| // the other devices starts from next. |
| pub const X86_64_SCI_IRQ: u32 = 5; |
| // The CMOS RTC uses IRQ 8; start allocating IRQs at 9. |
| pub const X86_64_IRQ_BASE: u32 = 9; |
| const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000; |
| |
| // pVM firmware memory. Should be within the low 4GB, so that it is identity-mapped |
| // by setup_page_tables() when a protected VM boots in long mode, since the pVM firmware is |
| // the VM entry point. |
| const PROTECTED_VM_FW_MAX_SIZE: u64 = 0x40_0000; |
| // Load the pVM firmware just below 2 GB to allow use of `-mcmodel=small`. |
| const PROTECTED_VM_FW_START: u64 = 0x8000_0000 - PROTECTED_VM_FW_MAX_SIZE; |
| |
| #[derive(Debug, PartialEq, Eq)] |
| pub enum CpuManufacturer { |
| Intel, |
| Amd, |
| Unknown, |
| } |
| |
| pub fn get_cpu_manufacturer() -> CpuManufacturer { |
| cpuid::cpu_manufacturer() |
| } |
| |
| pub struct ArchMemoryLayout { |
| // the pci mmio range below 4G |
| pci_mmio_before_32bit: AddressRange, |
| // the pcie cfg mmio range |
| pcie_cfg_mmio: AddressRange, |
| // the pVM firmware memory (if running a protected VM) |
| pvmfw_mem: Option<AddressRange>, |
| } |
| |
| pub fn create_arch_memory_layout( |
| pci_config: &PciConfig, |
| has_protected_vm_firmware: bool, |
| ) -> Result<ArchMemoryLayout> { |
| // the max bus number is 256 and each bus occupy 1MB, so the max pcie cfg mmio size = 256M |
| const MAX_PCIE_ECAM_SIZE: u64 = 256 * MB; |
| let pcie_cfg_mmio = match pci_config.ecam { |
| Some(MemoryRegionConfig { |
| start, |
| size: Some(size), |
| }) => AddressRange::from_start_and_size(start, size.min(MAX_PCIE_ECAM_SIZE)).unwrap(), |
| Some(MemoryRegionConfig { start, size: None }) => { |
| AddressRange::from_start_and_end(start, DEFAULT_PCIE_CFG_MMIO_END) |
| } |
| None => { |
| AddressRange::from_start_and_end(DEFAULT_PCIE_CFG_MMIO_START, DEFAULT_PCIE_CFG_MMIO_END) |
| } |
| }; |
| if pcie_cfg_mmio.start % pcie_cfg_mmio.len().unwrap() != 0 |
| || pcie_cfg_mmio.start % MB != 0 |
| || pcie_cfg_mmio.len().unwrap() % MB != 0 |
| { |
| return Err(Error::ConfigurePciEcam( |
| "base and len must be aligned to 1MB and base must be a multiple of len".to_string(), |
| )); |
| } |
| if pcie_cfg_mmio.end >= 0x1_0000_0000 { |
| return Err(Error::ConfigurePciEcam( |
| "end address can't go beyond 4G".to_string(), |
| )); |
| } |
| |
| let pci_mmio_before_32bit = match pci_config.mem { |
| Some(MemoryRegionConfig { |
| start, |
| size: Some(size), |
| }) => AddressRange::from_start_and_size(start, size) |
| .ok_or(Error::ConfigurePciMem("region overflowed".to_string()))?, |
| Some(MemoryRegionConfig { start, size: None }) => { |
| AddressRange::from_start_and_end(start, DEFAULT_PCI_MEM_END) |
| } |
| None => AddressRange::from_start_and_end( |
| pcie_cfg_mmio |
| .start |
| .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE), |
| DEFAULT_PCI_MEM_END, |
| ), |
| }; |
| |
| let pvmfw_mem = if has_protected_vm_firmware { |
| let range = AddressRange { |
| start: PROTECTED_VM_FW_START, |
| end: PROTECTED_VM_FW_START + PROTECTED_VM_FW_MAX_SIZE - 1, |
| }; |
| if !pci_mmio_before_32bit.intersect(range).is_empty() { |
| return Err(Error::PciMmioOverlapPvmFw); |
| } |
| |
| Some(range) |
| } else { |
| None |
| }; |
| |
| Ok(ArchMemoryLayout { |
| pci_mmio_before_32bit, |
| pcie_cfg_mmio, |
| pvmfw_mem, |
| }) |
| } |
| |
| /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it |
| /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction |
| /// pointer at the effective physical address 0xFFFF_FFF0. |
| fn bios_start(bios_size: u64) -> GuestAddress { |
| GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size) |
| } |
| |
| fn identity_map_addr_start() -> GuestAddress { |
| // Set Identity map address 4 pages before the max BIOS size |
| GuestAddress(FIRST_ADDR_PAST_32BITS - BIOS_MAX_SIZE - 4 * 0x1000) |
| } |
| |
| fn tss_addr_start() -> GuestAddress { |
| // Set TSS address one page after identity map address |
| GuestAddress(identity_map_addr_start().offset() + 0x1000) |
| } |
| |
| fn tss_addr_end() -> GuestAddress { |
| // Set TSS address section to have 3 pages |
| GuestAddress(tss_addr_start().offset() + 0x3000) |
| } |
| |
| fn configure_boot_params( |
| guest_mem: &GuestMemory, |
| cmdline_addr: GuestAddress, |
| setup_data: Option<GuestAddress>, |
| initrd: Option<(GuestAddress, u32)>, |
| mut params: boot_params, |
| e820_entries: &[E820Entry], |
| ) -> Result<()> { |
| const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; |
| const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; |
| const KERNEL_LOADER_OTHER: u8 = 0xff; |
| const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero. |
| |
| params.hdr.type_of_loader = KERNEL_LOADER_OTHER; |
| params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC; |
| params.hdr.header = KERNEL_HDR_MAGIC; |
| params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32; |
| params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32; |
| params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES; |
| if let Some(setup_data) = setup_data { |
| params.hdr.setup_data = setup_data.offset(); |
| } |
| if let Some((initrd_addr, initrd_size)) = initrd { |
| params.hdr.ramdisk_image = initrd_addr.offset() as u32; |
| params.ext_ramdisk_image = (initrd_addr.offset() >> 32) as u32; |
| params.hdr.ramdisk_size = initrd_size; |
| params.ext_ramdisk_size = 0; |
| } |
| |
| if e820_entries.len() >= params.e820_table.len() { |
| return Err(Error::E820Configuration); |
| } |
| |
| for (src, dst) in e820_entries.iter().zip(params.e820_table.iter_mut()) { |
| dst.addr = src.address.offset(); |
| dst.size = src.len; |
| dst.type_ = src.mem_type as u32; |
| } |
| params.e820_entries = e820_entries.len() as u8; |
| |
| let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET); |
| if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) { |
| return Err(Error::ZeroPagePastRamEnd); |
| } |
| |
| guest_mem |
| .write_obj_at_addr(params, zero_page_addr) |
| .map_err(|_| Error::ZeroPageSetup)?; |
| |
| Ok(()) |
| } |
| |
| fn configure_multiboot_info( |
| guest_mem: &GuestMemory, |
| cmdline_addr: GuestAddress, |
| e820_entries: &[E820Entry], |
| ) -> Result<()> { |
| let mut multiboot_info = MultibootInfo { |
| ..Default::default() |
| }; |
| |
| // Extra Multiboot-related data is added directly after the info structure. |
| let mut multiboot_data_addr = |
| GuestAddress(MULTIBOOT_INFO_OFFSET + mem::size_of_val(&multiboot_info) as u64); |
| multiboot_data_addr = multiboot_data_addr |
| .align(16) |
| .ok_or(Error::MultibootInfoSetup)?; |
| |
| // mem_lower is the amount of RAM below 1 MB, in units of KiB. |
| let mem_lower = guest_mem |
| .regions() |
| .filter(|r| { |
| r.options.purpose == MemoryRegionPurpose::GuestMemoryRegion |
| && r.guest_addr.offset() < 1 * MB |
| }) |
| .map(|r| r.size as u64) |
| .sum::<u64>() |
| / KB; |
| |
| // mem_upper is the amount of RAM above 1 MB up to the first memory hole, in units of KiB. |
| // We don't have the ISA 15-16 MB hole, so this includes all RAM from 1 MB up to the |
| // beginning of the PCI hole just below 4 GB. |
| let mem_upper = guest_mem |
| .regions() |
| .filter(|r| { |
| r.options.purpose == MemoryRegionPurpose::GuestMemoryRegion |
| && r.guest_addr.offset() >= 1 * MB |
| && r.guest_addr.offset() < 4 * GB |
| }) |
| .map(|r| r.size as u64) |
| .sum::<u64>() |
| / KB; |
| |
| multiboot_info.mem_lower = mem_lower as u32; |
| multiboot_info.mem_upper = mem_upper as u32; |
| multiboot_info.flags |= MultibootInfo::F_MEM; |
| |
| // Memory map - convert from params.e820_table to Multiboot format. |
| let multiboot_mmap: Vec<MultibootMmapEntry> = e820_entries |
| .iter() |
| .map(|e820_entry| MultibootMmapEntry { |
| size: 20, // size of the entry, not including the size field itself |
| base_addr: e820_entry.address.offset(), |
| length: e820_entry.len, |
| type_: e820_entry.mem_type as u32, |
| }) |
| .collect(); |
| let multiboot_mmap_bytes = multiboot_mmap.as_bytes(); |
| let multiboot_mmap_addr = |
| append_multiboot_info(guest_mem, &mut multiboot_data_addr, multiboot_mmap_bytes)?; |
| multiboot_info.mmap_addr = multiboot_mmap_addr.offset() as u32; |
| multiboot_info.mmap_length = multiboot_mmap_bytes.len() as u32; |
| multiboot_info.flags |= MultibootInfo::F_MMAP; |
| |
| // Command line |
| multiboot_info.cmdline = cmdline_addr.offset() as u32; |
| multiboot_info.flags |= MultibootInfo::F_CMDLINE; |
| |
| // Boot loader name |
| let boot_loader_name_addr = |
| append_multiboot_info(guest_mem, &mut multiboot_data_addr, b"crosvm\0")?; |
| multiboot_info.boot_loader_name = boot_loader_name_addr.offset() as u32; |
| multiboot_info.flags |= MultibootInfo::F_BOOT_LOADER_NAME; |
| |
| guest_mem |
| .write_obj_at_addr(multiboot_info, GuestAddress(MULTIBOOT_INFO_OFFSET)) |
| .map_err(|_| Error::MultibootInfoSetup)?; |
| |
| Ok(()) |
| } |
| |
| fn append_multiboot_info( |
| guest_mem: &GuestMemory, |
| addr: &mut GuestAddress, |
| data: &[u8], |
| ) -> Result<GuestAddress> { |
| let data_addr = *addr; |
| let new_addr = addr |
| .checked_add(data.len() as u64) |
| .and_then(|a| a.align(16)) |
| .ok_or(Error::MultibootInfoSetup)?; |
| |
| // Make sure we don't write beyond the region reserved for Multiboot info. |
| if new_addr.offset() - MULTIBOOT_INFO_OFFSET > MULTIBOOT_INFO_SIZE { |
| return Err(Error::MultibootInfoSetup); |
| } |
| |
| guest_mem |
| .write_all_at_addr(data, data_addr) |
| .map_err(|_| Error::MultibootInfoSetup)?; |
| |
| *addr = new_addr; |
| Ok(data_addr) |
| } |
| |
| /// Write setup_data entries in guest memory and link them together with the `next` field. |
| /// |
| /// Returns the guest address of the first entry in the setup_data list, if any. |
| fn write_setup_data( |
| guest_mem: &GuestMemory, |
| setup_data_start: GuestAddress, |
| setup_data_end: GuestAddress, |
| setup_data: &[SetupData], |
| ) -> Result<Option<GuestAddress>> { |
| let mut setup_data_list_head = None; |
| |
| // Place the first setup_data at the first 64-bit aligned offset following setup_data_start. |
| let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?; |
| |
| let mut entry_iter = setup_data.iter().peekable(); |
| while let Some(entry) = entry_iter.next() { |
| if setup_data_list_head.is_none() { |
| setup_data_list_head = Some(setup_data_addr); |
| } |
| |
| // Ensure the entry (header plus data) fits into guest memory. |
| let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64; |
| let entry_end = setup_data_addr |
| .checked_add(entry_size) |
| .ok_or(Error::SetupDataTooLarge)?; |
| |
| if entry_end >= setup_data_end { |
| return Err(Error::SetupDataTooLarge); |
| } |
| |
| let next_setup_data_addr = if entry_iter.peek().is_some() { |
| // Place the next setup_data at a 64-bit aligned address. |
| setup_data_addr |
| .checked_add(entry_size) |
| .and_then(|addr| addr.align(8)) |
| .ok_or(Error::SetupDataTooLarge)? |
| } else { |
| // This is the final entry. Terminate the list with next == 0. |
| GuestAddress(0) |
| }; |
| |
| let hdr = setup_data_hdr { |
| next: next_setup_data_addr.offset(), |
| type_: entry.type_ as u32, |
| len: entry |
| .data |
| .len() |
| .try_into() |
| .map_err(|_| Error::SetupDataTooLarge)?, |
| }; |
| |
| guest_mem |
| .write_obj_at_addr(hdr, setup_data_addr) |
| .map_err(Error::WritingSetupData)?; |
| guest_mem |
| .write_all_at_addr( |
| &entry.data, |
| setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64), |
| ) |
| .map_err(Error::WritingSetupData)?; |
| |
| setup_data_addr = next_setup_data_addr; |
| } |
| |
| Ok(setup_data_list_head) |
| } |
| |
| /// Find the first `setup_data_hdr` with the given type in guest memory and return its address. |
| fn find_setup_data( |
| mem: &GuestMemory, |
| setup_data_start: GuestAddress, |
| setup_data_end: GuestAddress, |
| type_: SetupDataType, |
| ) -> Option<GuestAddress> { |
| let mut setup_data_addr = setup_data_start.align(8)?; |
| while setup_data_addr < setup_data_end { |
| let hdr: setup_data_hdr = mem.read_obj_from_addr(setup_data_addr).ok()?; |
| if hdr.type_ == type_ as u32 { |
| return Some(setup_data_addr); |
| } |
| |
| if hdr.next == 0 { |
| return None; |
| } |
| |
| setup_data_addr = GuestAddress(hdr.next); |
| } |
| None |
| } |
| |
| /// Generate a SETUP_RNG_SEED SetupData with random seed data. |
| fn setup_data_rng_seed() -> SetupData { |
| let data: [u8; 256] = rand::random(); |
| SetupData { |
| data: data.to_vec(), |
| type_: SetupDataType::RngSeed, |
| } |
| } |
| |
| /// Add an e820 region to the e820 map. |
| fn add_e820_entry( |
| e820_entries: &mut Vec<E820Entry>, |
| range: AddressRange, |
| mem_type: E820Type, |
| ) -> Result<()> { |
| e820_entries.push(E820Entry { |
| address: GuestAddress(range.start), |
| len: range.len().ok_or(Error::E820Configuration)?, |
| mem_type, |
| }); |
| |
| Ok(()) |
| } |
| |
| /// Generate a memory map in INT 0x15 AX=0xE820 format. |
| fn generate_e820_memory_map( |
| arch_memory_layout: &ArchMemoryLayout, |
| guest_mem: &GuestMemory, |
| ) -> Result<Vec<E820Entry>> { |
| let mut e820_entries = Vec::new(); |
| |
| for r in guest_mem.regions() { |
| let range = AddressRange::from_start_and_size(r.guest_addr.offset(), r.size as u64) |
| .expect("invalid guest mem region"); |
| let mem_type = match r.options.purpose { |
| MemoryRegionPurpose::Bios => E820Type::Reserved, |
| MemoryRegionPurpose::GuestMemoryRegion => E820Type::Ram, |
| // After the pVM firmware jumped to the guest, the pVM firmware itself is no longer |
| // running, so its memory is reusable by the guest OS. So add this memory as RAM rather |
| // than Reserved. |
| MemoryRegionPurpose::ProtectedFirmwareRegion => E820Type::Ram, |
| MemoryRegionPurpose::ReservedMemory => E820Type::Reserved, |
| }; |
| add_e820_entry(&mut e820_entries, range, mem_type)?; |
| } |
| |
| let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio; |
| add_e820_entry(&mut e820_entries, pcie_cfg_mmio_range, E820Type::Reserved)?; |
| |
| add_e820_entry( |
| &mut e820_entries, |
| X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range), |
| E820Type::Reserved, |
| )?; |
| |
| // Reserve memory section for Identity map and TSS |
| add_e820_entry( |
| &mut e820_entries, |
| AddressRange { |
| start: identity_map_addr_start().offset(), |
| end: tss_addr_end().offset() - 1, |
| }, |
| E820Type::Reserved, |
| )?; |
| |
| Ok(e820_entries) |
| } |
| |
| /// Returns a Vec of the valid memory addresses. |
| /// These should be used to configure the GuestMemory structure for the platform. |
| /// For x86_64 all addresses are valid from the start of the kernel except a |
| /// carve out at the end of 32bit address space. |
| pub fn arch_memory_regions( |
| arch_memory_layout: &ArchMemoryLayout, |
| mem_size: u64, |
| bios_size: Option<u64>, |
| ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> { |
| let mut regions = Vec::new(); |
| |
| // Some guest kernels expect a typical PC memory layout where the region between 640 KB and |
| // 1 MB is reserved for device memory/ROMs and get confused if there is a RAM region |
| // spanning this area, so we provide the traditional 640 KB low memory and 1 MB+ |
| // high memory regions. |
| let mem_below_1m = 640 * KB; |
| regions.push(( |
| GuestAddress(0), |
| mem_below_1m, |
| MemoryRegionOptions::new().purpose(MemoryRegionPurpose::GuestMemoryRegion), |
| )); |
| |
| // Reserved/BIOS data area between 640 KB and 1 MB. |
| // This needs to be backed by an actual GuestMemory region so we can write BIOS tables here, but |
| // it should be reported as "reserved" in the e820 memory map to match PC architecture |
| // expectations. |
| regions.push(( |
| GuestAddress(640 * KB), |
| (1 * MB) - (640 * KB), |
| MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ReservedMemory), |
| )); |
| |
| // RAM between 1 MB and 4 GB |
| let mem_1m_to_4g = arch_memory_layout.pci_mmio_before_32bit.start.min(mem_size) - 1 * MB; |
| regions.push(( |
| GuestAddress(1 * MB), |
| mem_1m_to_4g, |
| MemoryRegionOptions::new().purpose(MemoryRegionPurpose::GuestMemoryRegion), |
| )); |
| |
| // RAM above 4 GB |
| let mem_above_4g = mem_size.saturating_sub(1 * MB + mem_1m_to_4g); |
| if mem_above_4g > 0 { |
| regions.push(( |
| GuestAddress(FIRST_ADDR_PAST_32BITS), |
| mem_above_4g, |
| MemoryRegionOptions::new().purpose(MemoryRegionPurpose::GuestMemoryRegion), |
| )); |
| } |
| |
| if let Some(bios_size) = bios_size { |
| regions.push(( |
| bios_start(bios_size), |
| bios_size, |
| MemoryRegionOptions::new().purpose(MemoryRegionPurpose::Bios), |
| )); |
| } |
| |
| if let Some(pvmfw_mem) = arch_memory_layout.pvmfw_mem { |
| // Remove any areas of guest memory regions that overlap the pVM firmware range. |
| while let Some(overlapping_region_index) = regions.iter().position(|(addr, size, _opts)| { |
| let region_addr_range = AddressRange::from_start_and_size(addr.offset(), *size) |
| .expect("invalid GuestMemory range"); |
| region_addr_range.overlaps(pvmfw_mem) |
| }) { |
| let overlapping_region = regions.swap_remove(overlapping_region_index); |
| let overlapping_region_range = AddressRange::from_start_and_size( |
| overlapping_region.0.offset(), |
| overlapping_region.1, |
| ) |
| .unwrap(); |
| let (first, second) = overlapping_region_range.non_overlapping_ranges(pvmfw_mem); |
| if !first.is_empty() { |
| regions.push(( |
| GuestAddress(first.start), |
| first.len().unwrap(), |
| overlapping_region.2.clone(), |
| )); |
| } |
| if !second.is_empty() { |
| regions.push(( |
| GuestAddress(second.start), |
| second.len().unwrap(), |
| overlapping_region.2, |
| )); |
| } |
| } |
| |
| // Insert a region for the pVM firmware area. |
| regions.push(( |
| GuestAddress(pvmfw_mem.start), |
| pvmfw_mem.len().expect("invalid pvmfw region"), |
| MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ProtectedFirmwareRegion), |
| )); |
| } |
| |
| regions.sort_unstable_by_key(|(addr, _, _)| *addr); |
| |
| for (addr, size, options) in ®ions { |
| debug!( |
| "{:#018x}-{:#018x} {:?}", |
| addr.offset(), |
| addr.offset() + size - 1, |
| options.purpose, |
| ); |
| } |
| |
| regions |
| } |
| |
| impl arch::LinuxArch for X8664arch { |
| type Error = Error; |
| type ArchMemoryLayout = ArchMemoryLayout; |
| |
| fn arch_memory_layout( |
| components: &VmComponents, |
| ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error> { |
| create_arch_memory_layout( |
| &components.pci_config, |
| components.hv_cfg.protection_type.runs_firmware(), |
| ) |
| } |
| |
| fn guest_memory_layout( |
| components: &VmComponents, |
| arch_memory_layout: &Self::ArchMemoryLayout, |
| _hypervisor: &impl Hypervisor, |
| ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> { |
| let bios_size = match &components.vm_image { |
| VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()), |
| VmImage::Kernel(_) => None, |
| }; |
| |
| Ok(arch_memory_regions( |
| arch_memory_layout, |
| components.memory_size, |
| bios_size, |
| )) |
| } |
| |
| fn get_system_allocator_config<V: Vm>( |
| vm: &V, |
| arch_memory_layout: &Self::ArchMemoryLayout, |
| ) -> SystemAllocatorConfig { |
| SystemAllocatorConfig { |
| io: Some(AddressRange { |
| start: 0xc000, |
| end: 0xffff, |
| }), |
| low_mmio: arch_memory_layout.pci_mmio_before_32bit, |
| high_mmio: Self::get_high_mmio_range(vm, arch_memory_layout), |
| platform_mmio: None, |
| first_irq: X86_64_IRQ_BASE, |
| } |
| } |
| |
| fn build_vm<V, Vcpu>( |
| mut components: VmComponents, |
| arch_memory_layout: &Self::ArchMemoryLayout, |
| vm_evt_wrtube: &SendTube, |
| system_allocator: &mut SystemAllocator, |
| serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, |
| serial_jail: Option<Minijail>, |
| battery: (Option<BatteryType>, Option<Minijail>), |
| mut vm: V, |
| ramoops_region: Option<arch::pstore::RamoopsRegion>, |
| devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, |
| irq_chip: &mut dyn IrqChipX86_64, |
| vcpu_ids: &mut Vec<usize>, |
| dump_device_tree_blob: Option<PathBuf>, |
| debugcon_jail: Option<Minijail>, |
| pflash_jail: Option<Minijail>, |
| fw_cfg_jail: Option<Minijail>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, |
| device_tree_overlays: Vec<DtbOverlay>, |
| _fdt_position: Option<FdtPosition>, |
| _no_pmu: bool, |
| ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> |
| where |
| V: VmX86_64, |
| Vcpu: VcpuX86_64, |
| { |
| let mem = vm.get_memory().clone(); |
| |
| let vcpu_count = components.vcpu_count; |
| |
| vm.set_identity_map_addr(identity_map_addr_start()) |
| .map_err(Error::SetIdentityMapAddr)?; |
| |
| vm.set_tss_addr(tss_addr_start()) |
| .map_err(Error::SetTssAddr)?; |
| |
| // Use IRQ info in ACPI if provided by the user. |
| let mut mptable = true; |
| let mut sci_irq = X86_64_SCI_IRQ; |
| |
| // punch pcie config mmio from pci low mmio, so that it couldn't be |
| // allocated to any device. |
| let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio; |
| system_allocator |
| .reserve_mmio(pcie_cfg_mmio_range) |
| .map_err(Error::ReservePcieCfgMmio)?; |
| |
| for sdt in components.acpi_sdts.iter() { |
| if sdt.is_signature(b"FACP") { |
| mptable = false; |
| let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT); |
| sci_irq = sci_irq_fadt.into(); |
| if !system_allocator.reserve_irq(sci_irq) { |
| warn!("sci irq {} already reserved.", sci_irq); |
| } |
| } |
| } |
| |
| let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range); |
| let mmio_bus = Arc::new(Bus::new(BusType::Mmio)); |
| let io_bus = Arc::new(Bus::new(BusType::Io)); |
| let hypercall_bus = Arc::new(Bus::new(BusType::Hypercall)); |
| |
| let (pci_devices, _devs): (Vec<_>, Vec<_>) = devs |
| .into_iter() |
| .partition(|(dev, _)| dev.as_pci_device().is_some()); |
| |
| let pci_devices = pci_devices |
| .into_iter() |
| .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig)) |
| .collect(); |
| |
| let (pci, pci_irqs, pid_debug_label_map, amls, gpe_scope_amls) = arch::generate_pci_root( |
| pci_devices, |
| irq_chip.as_irq_chip_mut(), |
| mmio_bus.clone(), |
| GuestAddress(pcie_cfg_mmio_range.start), |
| 12, |
| io_bus.clone(), |
| system_allocator, |
| &mut vm, |
| 4, // Share the four pin interrupts (INTx#) |
| Some(pcie_vcfg_range.start), |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::CreatePciRoot)?; |
| |
| let pci = Arc::new(Mutex::new(pci)); |
| pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start); |
| let pci_cfg = PciConfigIo::new( |
| pci.clone(), |
| components.break_linux_pci_config_io, |
| vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?, |
| ); |
| let pci_bus = Arc::new(Mutex::new(pci_cfg)); |
| io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap(); |
| |
| let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12))); |
| let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap(); |
| mmio_bus |
| .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len) |
| .unwrap(); |
| |
| let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13))); |
| mmio_bus |
| .insert( |
| pcie_vcfg_mmio, |
| pcie_vcfg_range.start, |
| pcie_vcfg_range.len().unwrap(), |
| ) |
| .unwrap(); |
| |
| // Event used to notify crosvm that guest OS is trying to suspend. |
| let (suspend_tube_send, suspend_tube_recv) = |
| Tube::directional_pair().map_err(Error::CreateTube)?; |
| let suspend_tube_send = Arc::new(Mutex::new(suspend_tube_send)); |
| |
| if components.fw_cfg_enable { |
| Self::setup_fw_cfg_device( |
| &io_bus, |
| components.fw_cfg_parameters.clone(), |
| components.bootorder_fw_cfg_blob.clone(), |
| fw_cfg_jail, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| )?; |
| } |
| |
| if !components.no_i8042 { |
| Self::setup_legacy_i8042_device( |
| &io_bus, |
| irq_chip.pit_uses_speaker_port(), |
| vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?, |
| )?; |
| } |
| let mut vm_request_tube = if !components.no_rtc { |
| let (host_tube, device_tube) = Tube::pair() |
| .context("create tube") |
| .map_err(Error::SetupCmos)?; |
| Self::setup_legacy_cmos_device( |
| arch_memory_layout, |
| &io_bus, |
| irq_chip, |
| device_tube, |
| components.memory_size, |
| ) |
| .map_err(Error::SetupCmos)?; |
| Some(host_tube) |
| } else { |
| None |
| }; |
| let serial_devices = Self::setup_serial_devices( |
| components.hv_cfg.protection_type, |
| irq_chip.as_irq_chip_mut(), |
| &io_bus, |
| serial_parameters, |
| serial_jail, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| )?; |
| Self::setup_debugcon_devices( |
| components.hv_cfg.protection_type, |
| &io_bus, |
| serial_parameters, |
| debugcon_jail, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| )?; |
| |
| let bios_size = if let VmImage::Bios(ref bios) = components.vm_image { |
| bios.metadata().map_err(Error::LoadBios)?.len() |
| } else { |
| 0 |
| }; |
| if let Some(pflash_image) = components.pflash_image { |
| Self::setup_pflash( |
| pflash_image, |
| components.pflash_block_size, |
| bios_size, |
| &mmio_bus, |
| pflash_jail, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| )?; |
| } |
| |
| // Functions that use/create jails MUST be used before the call to |
| // setup_acpi_devices below, as this move us into a multiprocessing state |
| // from which we can no longer fork. |
| |
| let mut resume_notify_devices = Vec::new(); |
| |
| // each bus occupy 1MB mmio for pcie enhanced configuration |
| let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8; |
| let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices( |
| arch_memory_layout, |
| pci.clone(), |
| &mem, |
| &io_bus, |
| system_allocator, |
| suspend_tube_send.clone(), |
| vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?, |
| components.acpi_sdts, |
| irq_chip.as_irq_chip_mut(), |
| sci_irq, |
| battery, |
| &mmio_bus, |
| max_bus, |
| &mut resume_notify_devices, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| components.ac_adapter, |
| guest_suspended_cvar, |
| &pci_irqs, |
| )?; |
| |
| // Create customized SSDT table |
| let sdt = acpi::create_customize_ssdt(pci.clone(), amls, gpe_scope_amls); |
| if let Some(sdt) = sdt { |
| acpi_dev_resource.sdts.push(sdt); |
| } |
| |
| irq_chip |
| .finalize_devices(system_allocator, &io_bus, &mmio_bus) |
| .map_err(Error::RegisterIrqfd)?; |
| |
| // All of these bios generated tables are set manually for the benefit of the kernel boot |
| // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't |
| // have a way to pass the BIOS these configs. |
| // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these |
| // tables and the guest OS picks them up. |
| // If another guest does need a way to pass these tables down to it's BIOS, this approach |
| // should be rethought. |
| |
| // Make sure the `vcpu_count` casts below and the arithmetic in `setup_mptable` are well |
| // defined. |
| if vcpu_count >= u8::MAX.into() { |
| return Err(Error::TooManyVcpus); |
| } |
| |
| if mptable { |
| mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs) |
| .map_err(Error::SetupMptable)?; |
| } |
| smbios::setup_smbios(&mem, &components.smbios, bios_size).map_err(Error::SetupSmbios)?; |
| |
| let host_cpus = if components.host_cpu_topology { |
| components.vcpu_affinity.clone() |
| } else { |
| None |
| }; |
| |
| // TODO (tjeznach) Write RSDP to bootconfig before writing to memory |
| acpi::create_acpi_tables( |
| &mem, |
| vcpu_count as u8, |
| sci_irq, |
| 0xcf9, |
| 6, // RST_CPU|SYS_RST |
| &acpi_dev_resource, |
| host_cpus, |
| vcpu_ids, |
| &pci_irqs, |
| pcie_cfg_mmio_range.start, |
| max_bus, |
| components.force_s2idle, |
| ) |
| .ok_or(Error::CreateAcpi)?; |
| |
| let mut cmdline = Self::get_base_linux_cmdline(); |
| |
| get_serial_cmdline(&mut cmdline, serial_parameters, "io", &serial_devices) |
| .map_err(Error::GetSerialCmdline)?; |
| |
| for param in components.extra_kernel_params { |
| cmdline.insert_str(¶m).map_err(Error::Cmdline)?; |
| } |
| |
| if let Some(ramoops_region) = ramoops_region { |
| arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region) |
| .map_err(Error::Cmdline)?; |
| } |
| |
| let pci_start = arch_memory_layout.pci_mmio_before_32bit.start; |
| |
| let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count]; |
| let mut msrs = BTreeMap::new(); |
| |
| let protection_type = components.hv_cfg.protection_type; |
| |
| match components.vm_image { |
| VmImage::Bios(ref mut bios) => { |
| if protection_type.runs_firmware() { |
| return Err(Error::PvmFwBiosUnsupported); |
| } |
| |
| // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it. |
| Self::load_cmdline( |
| &mem, |
| GuestAddress(CMDLINE_OFFSET), |
| cmdline, |
| CMDLINE_MAX_SIZE as usize - 1, |
| )?; |
| Self::load_bios(&mem, bios)?; |
| regs::set_default_msrs(&mut msrs); |
| // The default values for `Regs` and `Sregs` already set up the reset vector. |
| } |
| VmImage::Kernel(ref mut kernel_image) => { |
| let (params, kernel_region, kernel_entry, mut cpu_mode, kernel_type) = |
| Self::load_kernel(&mem, kernel_image)?; |
| |
| info!("Loaded {} kernel", kernel_type); |
| |
| Self::setup_system_memory( |
| arch_memory_layout, |
| &mem, |
| cmdline, |
| components.initrd_image, |
| components.android_fstab, |
| kernel_region, |
| params, |
| dump_device_tree_blob, |
| device_tree_overlays, |
| protection_type, |
| )?; |
| |
| if protection_type.needs_firmware_loaded() { |
| arch::load_image( |
| &mem, |
| &mut components |
| .pvm_fw |
| .expect("pvmfw must be available if ProtectionType loads it"), |
| GuestAddress(PROTECTED_VM_FW_START), |
| PROTECTED_VM_FW_MAX_SIZE, |
| ) |
| .map_err(Error::LoadCustomPvmFw)?; |
| } else if protection_type.runs_firmware() { |
| // Tell the hypervisor to load the pVM firmware. |
| vm.load_protected_vm_firmware( |
| GuestAddress(PROTECTED_VM_FW_START), |
| PROTECTED_VM_FW_MAX_SIZE, |
| ) |
| .map_err(Error::LoadPvmFw)?; |
| } |
| |
| let entry_addr = if protection_type.needs_firmware_loaded() { |
| Some(PROTECTED_VM_FW_START) |
| } else if protection_type.runs_firmware() { |
| None // Initial RIP value is set by the hypervisor |
| } else { |
| Some(kernel_entry.offset()) |
| }; |
| |
| if let Some(entry) = entry_addr { |
| vcpu_init[0].regs.rip = entry; |
| } |
| |
| match kernel_type { |
| KernelType::BzImage | KernelType::Elf => { |
| // Configure the bootstrap VCPU for the Linux/x86 boot protocol. |
| // <https://www.kernel.org/doc/html/latest/x86/boot.html> |
| vcpu_init[0].regs.rsp = BOOT_STACK_POINTER; |
| vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET; |
| } |
| KernelType::Multiboot => { |
| // Provide Multiboot-compatible bootloader information. |
| vcpu_init[0].regs.rax = MULTIBOOT_BOOTLOADER_MAGIC.into(); |
| vcpu_init[0].regs.rbx = MULTIBOOT_INFO_OFFSET; |
| } |
| } |
| |
| if protection_type.runs_firmware() { |
| // Pass DTB address to pVM firmware. This is redundant with the DTB entry in the |
| // `setup_data` list, but it allows the pVM firmware to know the location of the |
| // DTB without having the `setup_data` region mapped yet. |
| if let Some(fdt_setup_data_addr) = find_setup_data( |
| &mem, |
| GuestAddress(SETUP_DATA_START), |
| GuestAddress(SETUP_DATA_END), |
| SetupDataType::Dtb, |
| ) { |
| vcpu_init[0].regs.rdx = |
| fdt_setup_data_addr.offset() + size_of::<setup_data_hdr>() as u64; |
| } |
| |
| // Pass pVM payload entry address to pVM firmware. |
| // NOTE: this is only for development purposes. An actual pvmfw |
| // implementation should not use this value and should instead receive |
| // the pVM payload start and size info from crosvm as the DTB properties |
| // /config/kernel-address and /config/kernel-size and determine the offset |
| // of the entry point on its own, not trust crosvm to provide it. |
| vcpu_init[0].regs.rdi = kernel_entry.offset(); |
| |
| // The pVM firmware itself always starts in 32-bit protected mode |
| // with paging disabled, regardless of the type of payload. |
| cpu_mode = CpuMode::FlatProtectedMode; |
| } |
| |
| match cpu_mode { |
| CpuMode::LongMode => { |
| regs::set_long_mode_msrs(&mut msrs); |
| |
| // Set up long mode and enable paging. |
| regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs) |
| .map_err(Error::ConfigureSegments)?; |
| regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs) |
| .map_err(Error::SetupPageTables)?; |
| } |
| CpuMode::FlatProtectedMode => { |
| regs::set_default_msrs(&mut msrs); |
| |
| // Set up 32-bit protected mode with paging disabled. |
| regs::configure_segments_and_sregs_flat32(&mem, &mut vcpu_init[0].sregs) |
| .map_err(Error::ConfigureSegments)?; |
| } |
| } |
| |
| regs::set_mtrr_msrs(&mut msrs, &vm, pci_start); |
| } |
| } |
| |
| // Initialize MSRs for all VCPUs. |
| for vcpu in vcpu_init.iter_mut() { |
| vcpu.msrs = msrs.clone(); |
| } |
| |
| let mut vm_request_tubes = Vec::new(); |
| if let Some(req_tube) = vm_request_tube.take() { |
| vm_request_tubes.push(req_tube); |
| } |
| |
| Ok(RunnableLinuxVm { |
| vm, |
| vcpu_count, |
| vcpus: None, |
| vcpu_affinity: components.vcpu_affinity, |
| vcpu_init, |
| no_smt: components.no_smt, |
| irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?, |
| hypercall_bus, |
| io_bus, |
| mmio_bus, |
| pid_debug_label_map, |
| suspend_tube: (suspend_tube_send, suspend_tube_recv), |
| resume_notify_devices, |
| rt_cpus: components.rt_cpus, |
| delay_rt: components.delay_rt, |
| bat_control, |
| pm: Some(acpi_dev_resource.pm), |
| root_config: pci, |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| platform_devices: Vec::new(), |
| hotplug_bus: BTreeMap::new(), |
| devices_thread: None, |
| vm_request_tubes, |
| }) |
| } |
| |
| fn configure_vcpu<V: Vm>( |
| vm: &V, |
| hypervisor: &dyn HypervisorX86_64, |
| irq_chip: &mut dyn IrqChipX86_64, |
| vcpu: &mut dyn VcpuX86_64, |
| vcpu_init: VcpuInitX86_64, |
| vcpu_id: usize, |
| num_cpus: usize, |
| cpu_config: Option<CpuConfigX86_64>, |
| ) -> Result<()> { |
| let cpu_config = match cpu_config { |
| Some(config) => config, |
| None => return Err(Error::InvalidCpuConfig), |
| }; |
| if !vm.check_capability(VmCap::EarlyInitCpuid) { |
| cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_cpus, cpu_config) |
| .map_err(Error::SetupCpuid)?; |
| } |
| |
| vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?; |
| |
| vcpu.set_sregs(&vcpu_init.sregs) |
| .map_err(Error::SetupSregs)?; |
| |
| vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?; |
| |
| let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu); |
| let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs); |
| let skip_mtrr_msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs { |
| warn!( |
| "Too many variable MTRR entries ({} required, {} supported), |
| please check pci_start addr, guest with pass through device may be very slow", |
| num_var_mtrrs, vcpu_supported_var_mtrrs, |
| ); |
| // Filter out the MTRR entries from the MSR list. |
| true |
| } else { |
| false |
| }; |
| |
| for (msr_index, value) in vcpu_init.msrs.into_iter() { |
| if skip_mtrr_msrs && regs::is_mtrr_msr(msr_index) { |
| continue; |
| } |
| |
| vcpu.set_msr(msr_index, value).map_err(Error::SetupMsrs)?; |
| } |
| |
| interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?; |
| |
| Ok(()) |
| } |
| |
| fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>( |
| linux: &mut RunnableLinuxVm<V, Vcpu>, |
| device: Box<dyn PciDevice>, |
| #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>, |
| resources: &mut SystemAllocator, |
| hp_control_tube: &mpsc::Sender<PciRootCommand>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| ) -> Result<PciAddress> { |
| arch::configure_pci_device( |
| linux, |
| device, |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| minijail, |
| resources, |
| hp_control_tube, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::ConfigurePciDevice) |
| } |
| |
| fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>> { |
| Ok(BTreeMap::new()) |
| } |
| |
| fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>> { |
| Ok(BTreeMap::new()) |
| } |
| |
| fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>> { |
| Ok(BTreeMap::new()) |
| } |
| |
| fn get_host_cpu_clusters() -> Result<Vec<CpuSet>> { |
| Ok(Vec::new()) |
| } |
| } |
| |
| // OSC returned status register in CDW1 |
| const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4; |
| // pci host bridge OSC returned control register in CDW3 |
| #[allow(dead_code)] |
| const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1; |
| const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2; |
| #[allow(dead_code)] |
| const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4; |
| const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8; |
| #[allow(dead_code)] |
| const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10; |
| |
| struct PciRootOSC {} |
| |
| // Method (_OSC, 4, NotSerialized) // _OSC: Operating System Capabilities |
| // { |
| // CreateDWordField (Arg3, Zero, CDW1) // flag and return value |
| // If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766")) |
| // { |
| // CreateDWordField (Arg3, 8, CDW3) // control field |
| // if ( 0 == (CDW1 & 0x01)) // Query flag ? |
| // { |
| // CDW3 &= !(SHPC_HP | AER) |
| // } |
| // } Else { |
| // CDW1 |= UNSUPPORT_UUID |
| // } |
| // Return (Arg3) |
| // } |
| impl Aml for PciRootOSC { |
| fn to_aml_bytes(&self, aml: &mut Vec<u8>) { |
| let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766"; |
| // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all |
| // the other bits. |
| let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER); |
| aml::Method::new( |
| "_OSC".into(), |
| 4, |
| false, |
| vec![ |
| &aml::CreateDWordField::new( |
| &aml::Name::new_field_name("CDW1"), |
| &aml::Arg(3), |
| &aml::ZERO, |
| ), |
| &aml::If::new( |
| &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)), |
| vec![ |
| &aml::CreateDWordField::new( |
| &aml::Name::new_field_name("CDW3"), |
| &aml::Arg(3), |
| &(8_u8), |
| ), |
| &aml::If::new( |
| &aml::Equal::new( |
| &aml::ZERO, |
| &aml::And::new( |
| &aml::ZERO, |
| &aml::Name::new_field_name("CDW1"), |
| &aml::ONE, |
| ), |
| ), |
| vec![&aml::And::new( |
| &aml::Name::new_field_name("CDW3"), |
| &mask, |
| &aml::Name::new_field_name("CDW3"), |
| )], |
| ), |
| ], |
| ), |
| &aml::Else::new(vec![&aml::Or::new( |
| &aml::Name::new_field_name("CDW1"), |
| &OSC_STATUS_UNSUPPORT_UUID, |
| &aml::Name::new_field_name("CDW1"), |
| )]), |
| &aml::Return::new(&aml::Arg(3)), |
| ], |
| ) |
| .to_aml_bytes(aml) |
| } |
| } |
| |
| pub enum CpuMode { |
| /// 32-bit protected mode with paging disabled. |
| FlatProtectedMode, |
| |
| /// 64-bit long mode. |
| LongMode, |
| } |
| |
| #[derive(Copy, Clone, Debug, Eq, PartialEq)] |
| pub enum KernelType { |
| BzImage, |
| Elf, |
| Multiboot, |
| } |
| |
| impl fmt::Display for KernelType { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match self { |
| KernelType::BzImage => write!(f, "bzImage"), |
| KernelType::Elf => write!(f, "ELF"), |
| KernelType::Multiboot => write!(f, "Multiboot"), |
| } |
| } |
| } |
| |
| impl X8664arch { |
| /// Loads the bios from an open file. |
| /// |
| /// # Arguments |
| /// |
| /// * `mem` - The memory to be used by the guest. |
| /// * `bios_image` - the File object for the specified bios |
| fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> { |
| let bios_image_length = bios_image.get_len().map_err(Error::LoadBios)?; |
| if bios_image_length >= FIRST_ADDR_PAST_32BITS { |
| return Err(Error::LoadBios(io::Error::new( |
| io::ErrorKind::InvalidData, |
| format!( |
| "bios was {bios_image_length} bytes, expected less than {FIRST_ADDR_PAST_32BITS}", |
| ), |
| ))); |
| } |
| |
| let guest_slice = mem |
| .get_slice_at_addr(bios_start(bios_image_length), bios_image_length as usize) |
| .map_err(Error::SetupGuestMemory)?; |
| bios_image |
| .read_exact_at_volatile(guest_slice, 0) |
| .map_err(Error::LoadBios)?; |
| Ok(()) |
| } |
| |
| fn setup_pflash( |
| pflash_image: File, |
| block_size: u32, |
| bios_size: u64, |
| mmio_bus: &Bus, |
| jail: Option<Minijail>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| ) -> Result<()> { |
| let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len(); |
| let start = FIRST_ADDR_PAST_32BITS - bios_size - size; |
| let pflash_image = Box::new(pflash_image); |
| |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| let fds = pflash_image.as_raw_descriptors(); |
| |
| let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?; |
| let pflash: Arc<Mutex<dyn BusDevice>> = match jail { |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| Some(jail) => Arc::new(Mutex::new( |
| ProxyDevice::new( |
| pflash, |
| jail, |
| fds, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::CreateProxyDevice)?, |
| )), |
| #[cfg(windows)] |
| Some(_) => unreachable!(), |
| None => Arc::new(Mutex::new(pflash)), |
| }; |
| mmio_bus |
| .insert(pflash, start, size) |
| .map_err(Error::InsertBus)?; |
| |
| Ok(()) |
| } |
| |
| /// Writes the command line string to the given memory slice. |
| /// |
| /// # Arguments |
| /// |
| /// * `guest_mem` - A u8 slice that will be partially overwritten by the command line. |
| /// * `guest_addr` - The address in `guest_mem` at which to load the command line. |
| /// * `cmdline` - The kernel command line. |
| /// * `kernel_max_cmdline_len` - The maximum command line length (without NUL terminator) |
| /// supported by the kernel. |
| fn load_cmdline( |
| guest_mem: &GuestMemory, |
| guest_addr: GuestAddress, |
| cmdline: kernel_cmdline::Cmdline, |
| kernel_max_cmdline_len: usize, |
| ) -> Result<()> { |
| let mut cmdline_guest_mem_slice = guest_mem |
| .get_slice_at_addr(guest_addr, CMDLINE_MAX_SIZE as usize) |
| .map_err(|_| Error::CommandLineOverflow)?; |
| |
| let mut cmdline_bytes: Vec<u8> = cmdline |
| .into_bytes_with_max_len(kernel_max_cmdline_len) |
| .map_err(Error::Cmdline)?; |
| cmdline_bytes.push(0u8); // Add NUL terminator. |
| |
| cmdline_guest_mem_slice |
| .write_all(&cmdline_bytes) |
| .map_err(|_| Error::CommandLineOverflow)?; |
| |
| Ok(()) |
| } |
| |
| /// Loads the kernel from an open file. |
| /// |
| /// # Arguments |
| /// |
| /// * `mem` - The memory to be used by the guest. |
| /// * `kernel_image` - the File object for the specified kernel. |
| /// |
| /// # Returns |
| /// |
| /// On success, returns the Linux x86_64 boot protocol parameters, the address range containing |
| /// the kernel, the entry point (initial `RIP` value), the initial CPU mode, and the type of |
| /// kernel. |
| fn load_kernel( |
| mem: &GuestMemory, |
| kernel_image: &mut File, |
| ) -> Result<(boot_params, AddressRange, GuestAddress, CpuMode, KernelType)> { |
| let kernel_start = GuestAddress(KERNEL_START_OFFSET); |
| |
| let multiboot = |
| kernel_loader::multiboot_header_from_file(kernel_image).map_err(Error::LoadKernel)?; |
| |
| if let Some(multiboot_load) = multiboot.as_ref().and_then(|m| m.load.as_ref()) { |
| let loaded_kernel = kernel_loader::load_multiboot(mem, kernel_image, multiboot_load) |
| .map_err(Error::LoadKernel)?; |
| |
| let boot_params = boot_params { |
| hdr: setup_header { |
| cmdline_size: CMDLINE_MAX_SIZE as u32 - 1, |
| ..Default::default() |
| }, |
| ..Default::default() |
| }; |
| return Ok(( |
| boot_params, |
| loaded_kernel.address_range, |
| loaded_kernel.entry, |
| CpuMode::FlatProtectedMode, |
| KernelType::Multiboot, |
| )); |
| } |
| |
| match kernel_loader::load_elf(mem, kernel_start, kernel_image, 0) { |
| Ok(loaded_kernel) => { |
| // ELF kernels don't contain a `boot_params` structure, so synthesize a default one. |
| let boot_params = boot_params { |
| hdr: setup_header { |
| cmdline_size: CMDLINE_MAX_SIZE as u32 - 1, |
| ..Default::default() |
| }, |
| ..Default::default() |
| }; |
| Ok(( |
| boot_params, |
| loaded_kernel.address_range, |
| loaded_kernel.entry, |
| match loaded_kernel.class { |
| kernel_loader::ElfClass::ElfClass32 => CpuMode::FlatProtectedMode, |
| kernel_loader::ElfClass::ElfClass64 => CpuMode::LongMode, |
| }, |
| KernelType::Elf, |
| )) |
| } |
| Err(kernel_loader::Error::InvalidMagicNumber) => { |
| // The image failed to parse as ELF, so try to load it as a bzImage. |
| let (boot_params, bzimage_region, bzimage_entry, cpu_mode) = |
| bzimage::load_bzimage(mem, kernel_start, kernel_image) |
| .map_err(Error::LoadBzImage)?; |
| Ok(( |
| boot_params, |
| bzimage_region, |
| bzimage_entry, |
| cpu_mode, |
| KernelType::BzImage, |
| )) |
| } |
| Err(e) => Err(Error::LoadKernel(e)), |
| } |
| } |
| |
| /// Configures the system memory space should be called once per vm before |
| /// starting vcpu threads. |
| /// |
| /// # Arguments |
| /// |
| /// * `mem` - The memory to be used by the guest. |
| /// * `cmdline` - the kernel commandline |
| /// * `initrd_file` - an initial ramdisk image |
| pub fn setup_system_memory( |
| arch_memory_layout: &ArchMemoryLayout, |
| mem: &GuestMemory, |
| cmdline: kernel_cmdline::Cmdline, |
| initrd_file: Option<File>, |
| android_fstab: Option<File>, |
| kernel_region: AddressRange, |
| params: boot_params, |
| dump_device_tree_blob: Option<PathBuf>, |
| device_tree_overlays: Vec<DtbOverlay>, |
| protection_type: ProtectionType, |
| ) -> Result<()> { |
| let e820_entries = generate_e820_memory_map(arch_memory_layout, mem)?; |
| |
| let kernel_max_cmdline_len = if params.hdr.cmdline_size == 0 { |
| // Old kernels have a maximum length of 255 bytes, not including the NUL. |
| 255 |
| } else { |
| params.hdr.cmdline_size as usize |
| }; |
| debug!("kernel_max_cmdline_len={kernel_max_cmdline_len}"); |
| Self::load_cmdline( |
| mem, |
| GuestAddress(CMDLINE_OFFSET), |
| cmdline, |
| kernel_max_cmdline_len, |
| )?; |
| |
| let initrd = match initrd_file { |
| Some(mut initrd_file) => { |
| let initrd_addr_max = if params.hdr.xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G != 0 { |
| u64::MAX |
| } else if params.hdr.initrd_addr_max == 0 { |
| // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt). |
| 0x37FFFFFF |
| } else { |
| u64::from(params.hdr.initrd_addr_max) |
| }; |
| |
| let (initrd_start, initrd_size) = arch::load_image_high( |
| mem, |
| &mut initrd_file, |
| GuestAddress(kernel_region.end + 1), |
| GuestAddress(initrd_addr_max), |
| Some(|region| { |
| region.options.purpose != MemoryRegionPurpose::ProtectedFirmwareRegion |
| }), |
| base::pagesize() as u64, |
| ) |
| .map_err(Error::LoadInitrd)?; |
| Some((initrd_start, initrd_size)) |
| } |
| None => None, |
| }; |
| |
| let mut setup_data_entries = |
| SetupDataEntries::new(SETUP_DATA_START as usize, SETUP_DATA_END as usize); |
| |
| let setup_data_size = setup_data_entries.insert(setup_data_rng_seed()); |
| |
| // SETUP_DTB should be the last one in SETUP_DATA. |
| // This is to reserve enough space for SETUP_DTB |
| // without exceeding the size of SETUP_DATA area. |
| if android_fstab.is_some() |
| || !device_tree_overlays.is_empty() |
| || protection_type.runs_firmware() |
| { |
| let fdt_max_size = min(X86_64_FDT_MAX_SIZE as usize, setup_data_size); |
| let mut device_tree_blob = fdt::create_fdt( |
| mem, |
| android_fstab, |
| dump_device_tree_blob, |
| device_tree_overlays, |
| kernel_region, |
| initrd, |
| ) |
| .map_err(Error::CreateFdt)?; |
| if device_tree_blob.len() > fdt_max_size { |
| return Err(Error::DTBSizeGreaterThanAllowed); |
| } |
| |
| // Reserve and zero fill dtb memory to maximum allowable size |
| // so that pvmfw could patch and extend the dtb in-place. |
| device_tree_blob.resize(fdt_max_size, 0); |
| |
| setup_data_entries.insert(SetupData { |
| data: device_tree_blob, |
| type_: SetupDataType::Dtb, |
| }); |
| } |
| |
| let setup_data = setup_data_entries.write_setup_data(mem)?; |
| |
| configure_boot_params( |
| mem, |
| GuestAddress(CMDLINE_OFFSET), |
| setup_data, |
| initrd, |
| params, |
| &e820_entries, |
| )?; |
| |
| configure_multiboot_info(mem, GuestAddress(CMDLINE_OFFSET), &e820_entries)?; |
| |
| Ok(()) |
| } |
| |
| fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange { |
| // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is |
| // greater. |
| let ram_end_round_2mb = mem.end_addr().offset().next_multiple_of(2 * MB); |
| let start = std::cmp::max(ram_end_round_2mb, 4 * GB); |
| // Each pci device's ECAM size is 4kb and its vcfg size is 8kb |
| let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1; |
| AddressRange { start, end } |
| } |
| |
| /// Returns the high mmio range |
| fn get_high_mmio_range<V: Vm>(vm: &V, arch_memory_layout: &ArchMemoryLayout) -> AddressRange { |
| let mem = vm.get_memory(); |
| let start = Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).end + 1; |
| |
| let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1; |
| let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END); |
| |
| AddressRange { |
| start, |
| end: high_mmio_end, |
| } |
| } |
| |
| /// This returns a minimal kernel command for this architecture |
| pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline { |
| let mut cmdline = kernel_cmdline::Cmdline::new(); |
| cmdline.insert_str("panic=-1").unwrap(); |
| |
| cmdline |
| } |
| |
| /// Sets up fw_cfg device. |
| /// # Arguments |
| /// |
| /// * `io_bus` - the IO bus object |
| /// * `fw_cfg_parameters` - command-line specified data to add to device. May contain all None |
| /// fields if user did not specify data to add to the device |
| fn setup_fw_cfg_device( |
| io_bus: &Bus, |
| fw_cfg_parameters: Vec<FwCfgParameters>, |
| bootorder_fw_cfg_blob: Vec<u8>, |
| fw_cfg_jail: Option<Minijail>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| ) -> Result<()> { |
| let fw_cfg = match devices::FwCfgDevice::new(FW_CFG_MAX_FILE_SLOTS, fw_cfg_parameters) { |
| Ok(mut device) => { |
| // this condition will only be true if the user specified at least one bootindex |
| // option on the command line. If none were specified, bootorder_fw_cfg_blob will |
| // only have a null byte (null terminator) |
| if bootorder_fw_cfg_blob.len() > 1 { |
| // Add boot order file to the device. If the file is not present, firmware may |
| // not be able to boot. |
| if let Err(err) = device.add_file( |
| "bootorder", |
| bootorder_fw_cfg_blob, |
| devices::FwCfgItemType::GenericItem, |
| ) { |
| return Err(Error::CreateFwCfgDevice(err)); |
| } |
| } |
| device |
| } |
| Err(err) => { |
| return Err(Error::CreateFwCfgDevice(err)); |
| } |
| }; |
| |
| let fw_cfg: Arc<Mutex<dyn BusDevice>> = match fw_cfg_jail.as_ref() { |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| Some(jail) => { |
| let jail_clone = jail.try_clone().map_err(Error::CloneJail)?; |
| #[cfg(feature = "seccomp_trace")] |
| debug!( |
| "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}", |
| read_jail_addr(jail), |
| read_jail_addr(&jail_clone) |
| ); |
| Arc::new(Mutex::new( |
| ProxyDevice::new( |
| fw_cfg, |
| jail_clone, |
| Vec::new(), |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::CreateProxyDevice)?, |
| )) |
| } |
| #[cfg(windows)] |
| Some(_) => unreachable!(), |
| None => Arc::new(Mutex::new(fw_cfg)), |
| }; |
| |
| io_bus |
| .insert(fw_cfg, FW_CFG_BASE_PORT, FW_CFG_WIDTH) |
| .map_err(Error::InsertBus)?; |
| |
| Ok(()) |
| } |
| |
| /// Sets up the legacy x86 i8042/KBD platform device |
| /// |
| /// # Arguments |
| /// |
| /// * - `io_bus` - the IO bus object |
| /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker |
| /// * - `vm_evt_wrtube` - the event object which should receive exit events |
| pub fn setup_legacy_i8042_device( |
| io_bus: &Bus, |
| pit_uses_speaker_port: bool, |
| vm_evt_wrtube: SendTube, |
| ) -> Result<()> { |
| let i8042 = Arc::new(Mutex::new(devices::I8042Device::new( |
| vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?, |
| ))); |
| |
| if pit_uses_speaker_port { |
| io_bus.insert(i8042, 0x062, 0x3).unwrap(); |
| } else { |
| io_bus.insert(i8042, 0x061, 0x4).unwrap(); |
| } |
| |
| Ok(()) |
| } |
| |
| /// Sets up the legacy x86 CMOS/RTC platform device |
| /// # Arguments |
| /// |
| /// * - `io_bus` - the IO bus object |
| /// * - `mem_size` - the size in bytes of physical ram for the guest |
| pub fn setup_legacy_cmos_device( |
| arch_memory_layout: &ArchMemoryLayout, |
| io_bus: &Bus, |
| irq_chip: &mut dyn IrqChipX86_64, |
| vm_control: Tube, |
| mem_size: u64, |
| ) -> anyhow::Result<()> { |
| let mem_regions = arch_memory_regions(arch_memory_layout, mem_size, None); |
| |
| let mem_below_4g = mem_regions |
| .iter() |
| .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS) |
| .map(|r| r.1) |
| .sum(); |
| |
| let mem_above_4g = mem_regions |
| .iter() |
| .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS) |
| .map(|r| r.1) |
| .sum(); |
| |
| let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?; |
| let cmos = devices::cmos::Cmos::new( |
| mem_below_4g, |
| mem_above_4g, |
| Utc::now, |
| vm_control, |
| irq_evt.try_clone().context("cmos irq clone")?, |
| ) |
| .context("create cmos")?; |
| |
| irq_chip |
| .register_edge_irq_event( |
| devices::cmos::RTC_IRQ as u32, |
| &irq_evt, |
| IrqEventSource::from_device(&cmos), |
| ) |
| .context("cmos register irq")?; |
| io_bus |
| .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2) |
| .context("cmos insert irq")?; |
| |
| Ok(()) |
| } |
| |
| /// Sets up the acpi devices for this platform and |
| /// return the resources which is used to set the ACPI tables. |
| /// |
| /// # Arguments |
| /// |
| /// * `io_bus` the I/O bus to add the devices to |
| /// * `resources` the SystemAllocator to allocate IO and MMIO for acpi devices. |
| /// * `suspend_tube` the tube object which used to suspend/resume the VM. |
| /// * `sdts` ACPI system description tables |
| /// * `irq_chip` the IrqChip object for registering irq events |
| /// * `battery` indicate whether to create the battery |
| /// * `mmio_bus` the MMIO bus to add the devices to |
| /// * `pci_irqs` IRQ assignment of PCI devices. Tuples of (PCI address, gsi, PCI interrupt pin). |
| /// Note that this matches one of the return values of generate_pci_root. |
| pub fn setup_acpi_devices( |
| arch_memory_layout: &ArchMemoryLayout, |
| pci_root: Arc<Mutex<PciRoot>>, |
| mem: &GuestMemory, |
| io_bus: &Bus, |
| resources: &mut SystemAllocator, |
| suspend_tube: Arc<Mutex<SendTube>>, |
| vm_evt_wrtube: SendTube, |
| sdts: Vec<SDT>, |
| irq_chip: &mut dyn IrqChip, |
| sci_irq: u32, |
| battery: (Option<BatteryType>, Option<Minijail>), |
| #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus, |
| max_bus: u8, |
| resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool, |
| guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, |
| pci_irqs: &[(PciAddress, u32, PciInterruptPin)], |
| ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> { |
| // The AML data for the acpi devices |
| let mut amls = Vec::new(); |
| |
| let bat_control = if let Some(battery_type) = battery.0 { |
| match battery_type { |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| BatteryType::Goldfish => { |
| let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices( |
| arch::DeviceRegistrationError::AllocateIrq, |
| ))?; |
| let (control_tube, _mmio_base) = arch::sys::linux::add_goldfish_battery( |
| &mut amls, |
| battery.1, |
| mmio_bus, |
| irq_chip, |
| irq_num, |
| resources, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::CreateBatDevices)?; |
| Some(BatControl { |
| type_: BatteryType::Goldfish, |
| control_tube, |
| }) |
| } |
| #[cfg(windows)] |
| _ => None, |
| } |
| } else { |
| None |
| }; |
| |
| let pm_alloc = resources.get_anon_alloc(); |
| let pm_iobase = match resources.io_allocator() { |
| Some(io) => io |
| .allocate_with_align( |
| devices::acpi::ACPIPM_RESOURCE_LEN as u64, |
| pm_alloc, |
| "ACPIPM".to_string(), |
| 4, // must be 32-bit aligned |
| ) |
| .map_err(Error::AllocateIOResouce)?, |
| None => 0x600, |
| }; |
| |
| let pcie_vcfg = aml::Name::new( |
| "VCFG".into(), |
| &Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).start, |
| ); |
| pcie_vcfg.to_aml_bytes(&mut amls); |
| |
| let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?; |
| |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| let acdc = if ac_adapter { |
| // Allocate GPE for AC adapter notfication |
| let gpe = resources.allocate_gpe().ok_or(Error::AllocateGpe)?; |
| |
| let alloc = resources.get_anon_alloc(); |
| let mmio_base = resources |
| .allocate_mmio( |
| devices::ac_adapter::ACDC_VIRT_MMIO_SIZE, |
| alloc, |
| "AcAdapter".to_string(), |
| resources::AllocOptions::new().align(devices::ac_adapter::ACDC_VIRT_MMIO_SIZE), |
| ) |
| .unwrap(); |
| let ac_adapter_dev = devices::ac_adapter::AcAdapter::new(mmio_base, gpe); |
| let ac_dev = Arc::new(Mutex::new(ac_adapter_dev)); |
| mmio_bus |
| .insert( |
| ac_dev.clone(), |
| mmio_base, |
| devices::ac_adapter::ACDC_VIRT_MMIO_SIZE, |
| ) |
| .unwrap(); |
| |
| ac_dev.lock().to_aml_bytes(&mut amls); |
| Some(ac_dev) |
| } else { |
| None |
| }; |
| #[cfg(windows)] |
| let acdc = None; |
| |
| //Virtual PMC |
| if let Some(guest_suspended_cvar) = guest_suspended_cvar { |
| let alloc = resources.get_anon_alloc(); |
| let mmio_base = resources |
| .allocate_mmio( |
| devices::pmc_virt::VPMC_VIRT_MMIO_SIZE, |
| alloc, |
| "VirtualPmc".to_string(), |
| resources::AllocOptions::new().align(devices::pmc_virt::VPMC_VIRT_MMIO_SIZE), |
| ) |
| .unwrap(); |
| |
| let pmc_virtio_mmio = |
| Arc::new(Mutex::new(VirtualPmc::new(mmio_base, guest_suspended_cvar))); |
| mmio_bus |
| .insert( |
| pmc_virtio_mmio.clone(), |
| mmio_base, |
| devices::pmc_virt::VPMC_VIRT_MMIO_SIZE, |
| ) |
| .unwrap(); |
| pmc_virtio_mmio.lock().to_aml_bytes(&mut amls); |
| } |
| |
| let mut pmresource = devices::ACPIPMResource::new( |
| pm_sci_evt.try_clone().map_err(Error::CloneEvent)?, |
| suspend_tube, |
| vm_evt_wrtube, |
| acdc, |
| ); |
| pmresource.to_aml_bytes(&mut amls); |
| irq_chip |
| .register_level_irq_event( |
| sci_irq, |
| &pm_sci_evt, |
| IrqEventSource::from_device(&pmresource), |
| ) |
| .map_err(Error::RegisterIrqfd)?; |
| pmresource.start(); |
| |
| let mut crs_entries: Vec<Box<dyn Aml>> = vec![ |
| Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)), |
| Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)), |
| ]; |
| for r in resources.mmio_pools() { |
| let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) { |
| (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory( |
| aml::AddressSpaceCachable::NotCacheable, |
| true, |
| start, |
| end, |
| )), |
| _ => Box::new(aml::AddressSpace::new_memory( |
| aml::AddressSpaceCachable::NotCacheable, |
| true, |
| r.start, |
| r.end, |
| )), |
| }; |
| crs_entries.push(entry); |
| } |
| |
| let prt_entries: Vec<aml::Package> = pci_irqs |
| .iter() |
| .map(|(pci_address, gsi, pci_intr_pin)| { |
| aml::Package::new(vec![ |
| &pci_address.acpi_adr(), |
| &pci_intr_pin.to_mask(), |
| &aml::ZERO, |
| gsi, |
| ]) |
| }) |
| .collect(); |
| |
| aml::Device::new( |
| "_SB_.PC00".into(), |
| vec![ |
| &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")), |
| &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")), |
| &aml::Name::new("_ADR".into(), &aml::ZERO), |
| &aml::Name::new("_SEG".into(), &aml::ZERO), |
| &aml::Name::new("_UID".into(), &aml::ZERO), |
| &aml::Name::new("SUPP".into(), &aml::ZERO), |
| &aml::Name::new( |
| "_CRS".into(), |
| &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()), |
| ), |
| &PciRootOSC {}, |
| &aml::Name::new( |
| "_PRT".into(), |
| &aml::Package::new(prt_entries.iter().map(|p| p as &dyn Aml).collect()), |
| ), |
| ], |
| ) |
| .to_aml_bytes(&mut amls); |
| |
| if let (Some(start), Some(len)) = ( |
| u32::try_from(arch_memory_layout.pcie_cfg_mmio.start).ok(), |
| arch_memory_layout |
| .pcie_cfg_mmio |
| .len() |
| .and_then(|l| u32::try_from(l).ok()), |
| ) { |
| aml::Device::new( |
| "_SB_.MB00".into(), |
| vec![ |
| &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")), |
| &aml::Name::new( |
| "_CRS".into(), |
| &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new( |
| true, start, len, |
| )]), |
| ), |
| ], |
| ) |
| .to_aml_bytes(&mut amls); |
| } else { |
| warn!("Failed to create ACPI MMCFG region reservation"); |
| } |
| |
| let root_bus = pci_root.lock().get_root_bus(); |
| let addresses = root_bus.lock().get_downstream_devices(); |
| for address in addresses { |
| if let Some(acpi_path) = pci_root.lock().acpi_path(&address) { |
| const DEEPEST_SLEEP_STATE: u32 = 3; |
| aml::Device::new( |
| (*acpi_path).into(), |
| vec![ |
| &aml::Name::new("_ADR".into(), &address.acpi_adr()), |
| &aml::Name::new( |
| "_PRW".into(), |
| &aml::Package::new(vec![&PM_WAKEUP_GPIO, &DEEPEST_SLEEP_STATE]), |
| ), |
| ], |
| ) |
| .to_aml_bytes(&mut amls); |
| } |
| } |
| |
| let pm = Arc::new(Mutex::new(pmresource)); |
| io_bus |
| .insert( |
| pm.clone(), |
| pm_iobase, |
| devices::acpi::ACPIPM_RESOURCE_LEN as u64, |
| ) |
| .unwrap(); |
| resume_notify_devices.push(pm.clone()); |
| |
| Ok(( |
| acpi::AcpiDevResource { |
| amls, |
| pm_iobase, |
| pm, |
| sdts, |
| }, |
| bat_control, |
| )) |
| } |
| |
| /// Sets up the serial devices for this platform. Returns a list of configured serial devices. |
| /// |
| /// # Arguments |
| /// |
| /// * - `irq_chip` the IrqChip object for registering irq events |
| /// * - `io_bus` the I/O bus to add the devices to |
| /// * - `serial_parameters` - definitions for how the serial devices should be configured |
| pub fn setup_serial_devices( |
| protection_type: ProtectionType, |
| irq_chip: &mut dyn IrqChip, |
| io_bus: &Bus, |
| serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, |
| serial_jail: Option<Minijail>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| ) -> Result<Vec<SerialDeviceInfo>> { |
| let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?; |
| let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?; |
| |
| let serial_devices = arch::add_serial_devices( |
| protection_type, |
| io_bus, |
| (X86_64_SERIAL_1_3_IRQ, com_evt_1_3.get_trigger()), |
| (X86_64_SERIAL_2_4_IRQ, com_evt_2_4.get_trigger()), |
| serial_parameters, |
| serial_jail, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::CreateSerialDevices)?; |
| |
| let source = IrqEventSource { |
| device_id: Serial::device_id(), |
| queue_id: 0, |
| device_name: Serial::debug_label(), |
| }; |
| irq_chip |
| .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone()) |
| .map_err(Error::RegisterIrqfd)?; |
| irq_chip |
| .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source) |
| .map_err(Error::RegisterIrqfd)?; |
| |
| Ok(serial_devices) |
| } |
| |
| fn setup_debugcon_devices( |
| protection_type: ProtectionType, |
| io_bus: &Bus, |
| serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, |
| debugcon_jail: Option<Minijail>, |
| #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, |
| ) -> Result<()> { |
| for param in serial_parameters.values() { |
| if param.hardware != SerialHardware::Debugcon { |
| continue; |
| } |
| |
| let mut preserved_fds = Vec::new(); |
| let con = param |
| .create_serial_device::<Debugcon>( |
| protection_type, |
| // Debugcon doesn't use the interrupt event |
| &Event::new().map_err(Error::CreateEvent)?, |
| &mut preserved_fds, |
| ) |
| .map_err(Error::CreateDebugconDevice)?; |
| |
| let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() { |
| #[cfg(any(target_os = "android", target_os = "linux"))] |
| Some(jail) => { |
| let jail_clone = jail.try_clone().map_err(Error::CloneJail)?; |
| #[cfg(feature = "seccomp_trace")] |
| debug!( |
| "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}", |
| read_jail_addr(jail), |
| read_jail_addr(&jail_clone) |
| ); |
| Arc::new(Mutex::new( |
| ProxyDevice::new( |
| con, |
| jail_clone, |
| preserved_fds, |
| #[cfg(feature = "swap")] |
| swap_controller, |
| ) |
| .map_err(Error::CreateProxyDevice)?, |
| )) |
| } |
| #[cfg(windows)] |
| Some(_) => unreachable!(), |
| None => Arc::new(Mutex::new(con)), |
| }; |
| io_bus |
| .insert(con.clone(), param.debugcon_port.into(), 1) |
| .map_err(Error::InsertBus)?; |
| } |
| |
| Ok(()) |
| } |
| } |
| |
| #[sorted] |
| #[derive(Error, Debug)] |
| pub enum MsrError { |
| #[error("CPU not support. Only intel CPUs support ITMT.")] |
| CpuUnSupport, |
| #[error("msr must be unique: {0}")] |
| MsrDuplicate(u32), |
| } |
| |
| #[derive(Error, Debug)] |
| pub enum HybridSupportError { |
| #[error("Host CPU doesn't support hybrid architecture.")] |
| UnsupportedHostCpu, |
| } |
| |
| /// The wrapper for CPUID call functions. |
| pub struct CpuIdCall { |
| /// __cpuid_count or a fake function for test. |
| cpuid_count: unsafe fn(u32, u32) -> CpuidResult, |
| /// __cpuid or a fake function for test. |
| cpuid: unsafe fn(u32) -> CpuidResult, |
| } |
| |
| impl CpuIdCall { |
| pub fn new( |
| cpuid_count: unsafe fn(u32, u32) -> CpuidResult, |
| cpuid: unsafe fn(u32) -> CpuidResult, |
| ) -> CpuIdCall { |
| CpuIdCall { cpuid_count, cpuid } |
| } |
| } |
| |
| /// Check if host supports hybrid CPU feature. The check include: |
| /// 1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf. |
| /// 2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is identified as a |
| /// hybrid part. |
| /// 3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX. |
| /// |
| /// # Arguments |
| /// |
| /// * - `cpuid` the wrapped cpuid functions used to get CPUID info. |
| pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> { |
| // CPUID.0H.EAX returns maximum input value for basic CPUID information. |
| // |
| // SAFETY: |
| // Safe because we pass 0 for this call and the host supports the |
| // `cpuid` instruction. |
| let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) }; |
| if cpuid_entry.eax < 0x1A { |
| return Err(HybridSupportError::UnsupportedHostCpu); |
| } |
| // SAFETY: |
| // Safe because we pass 0x7 and 0 for this call and the host supports the |
| // `cpuid` instruction. |
| cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) }; |
| if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 { |
| return Err(HybridSupportError::UnsupportedHostCpu); |
| } |
| // From SDM, if a value entered for CPUID.EAX is less than or equal to the |
| // maximum input value and the leaf is not supported on that processor then |
| // 0 is returned in all the registers. |
| // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero. |
| // |
| // SAFETY: |
| // Safe because we pass 0 for this call and the host supports the |
| // `cpuid` instruction. |
| cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) }; |
| if cpuid_entry.eax == 0 { |
| return Err(HybridSupportError::UnsupportedHostCpu); |
| } |
| Ok(()) |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use std::mem::size_of; |
| |
| use super::*; |
| |
| fn setup() -> ArchMemoryLayout { |
| let pci_config = PciConfig { |
| ecam: Some(MemoryRegionConfig { |
| start: 3 * GB, |
| size: Some(256 * MB), |
| }), |
| mem: Some(MemoryRegionConfig { |
| start: 2 * GB, |
| size: None, |
| }), |
| }; |
| create_arch_memory_layout(&pci_config, false).unwrap() |
| } |
| |
| #[test] |
| fn regions_lt_4gb_nobios() { |
| let arch_memory_layout = setup(); |
| let regions = arch_memory_regions(&arch_memory_layout, 512 * MB, /* bios_size */ None); |
| assert_eq!( |
| regions, |
| [ |
| ( |
| GuestAddress(0), |
| 640 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(640 * KB), |
| 384 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::ReservedMemory, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(1 * MB), |
| 512 * MB - 1 * MB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ) |
| ] |
| ); |
| } |
| |
| #[test] |
| fn regions_gt_4gb_nobios() { |
| let arch_memory_layout = setup(); |
| let size = 4 * GB + 0x8000; |
| let regions = arch_memory_regions(&arch_memory_layout, size, /* bios_size */ None); |
| assert_eq!( |
| regions, |
| [ |
| ( |
| GuestAddress(0), |
| 640 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(640 * KB), |
| 384 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::ReservedMemory, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(1 * MB), |
| 2 * GB - 1 * MB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(4 * GB), |
| 2 * GB + 0x8000, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ] |
| ); |
| } |
| |
| #[test] |
| fn regions_lt_4gb_bios() { |
| let arch_memory_layout = setup(); |
| let bios_len = 1 * MB; |
| let regions = arch_memory_regions(&arch_memory_layout, 512 * MB, Some(bios_len)); |
| assert_eq!( |
| regions, |
| [ |
| ( |
| GuestAddress(0), |
| 640 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(640 * KB), |
| 384 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::ReservedMemory, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(1 * MB), |
| 512 * MB - 1 * MB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(4 * GB - bios_len), |
| bios_len, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::Bios, |
| file_backed: None, |
| }, |
| ), |
| ] |
| ); |
| } |
| |
| #[test] |
| fn regions_gt_4gb_bios() { |
| let arch_memory_layout = setup(); |
| let bios_len = 1 * MB; |
| let regions = arch_memory_regions(&arch_memory_layout, 4 * GB + 0x8000, Some(bios_len)); |
| assert_eq!( |
| regions, |
| [ |
| ( |
| GuestAddress(0), |
| 640 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(640 * KB), |
| 384 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::ReservedMemory, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(1 * MB), |
| 2 * GB - 1 * MB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(4 * GB - bios_len), |
| bios_len, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::Bios, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(4 * GB), |
| 2 * GB + 0x8000, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ] |
| ); |
| } |
| |
| #[test] |
| fn regions_eq_4gb_nobios() { |
| let arch_memory_layout = setup(); |
| // Test with exact size of 4GB - the overhead. |
| let regions = arch_memory_regions(&arch_memory_layout, 2 * GB, /* bios_size */ None); |
| assert_eq!( |
| regions, |
| [ |
| ( |
| GuestAddress(0), |
| 640 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(640 * KB), |
| 384 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::ReservedMemory, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(1 * MB), |
| 2 * GB - 1 * MB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ) |
| ] |
| ); |
| } |
| |
| #[test] |
| fn regions_eq_4gb_bios() { |
| let arch_memory_layout = setup(); |
| // Test with exact size of 4GB - the overhead. |
| let bios_len = 1 * MB; |
| let regions = arch_memory_regions(&arch_memory_layout, 2 * GB, Some(bios_len)); |
| assert_eq!( |
| regions, |
| [ |
| ( |
| GuestAddress(0), |
| 640 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(640 * KB), |
| 384 * KB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::ReservedMemory, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(1 * MB), |
| 2 * GB - 1 * MB, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::GuestMemoryRegion, |
| file_backed: None, |
| }, |
| ), |
| ( |
| GuestAddress(4 * GB - bios_len), |
| bios_len, |
| MemoryRegionOptions { |
| align: 0, |
| purpose: MemoryRegionPurpose::Bios, |
| file_backed: None, |
| }, |
| ), |
| ] |
| ); |
| } |
| |
| #[test] |
| fn check_pci_mmio_layout() { |
| let arch_memory_layout = setup(); |
| |
| assert_eq!(arch_memory_layout.pci_mmio_before_32bit.start, 2 * GB); |
| assert_eq!(arch_memory_layout.pcie_cfg_mmio.start, 3 * GB); |
| assert_eq!(arch_memory_layout.pcie_cfg_mmio.len().unwrap(), 256 * MB); |
| } |
| |
| #[test] |
| fn check_32bit_gap_size_alignment() { |
| let arch_memory_layout = setup(); |
| // pci_mmio_before_32bit is 256 MB aligned to be friendly for MTRR mappings. |
| assert_eq!( |
| arch_memory_layout.pci_mmio_before_32bit.start % (256 * MB), |
| 0 |
| ); |
| } |
| |
| #[test] |
| fn write_setup_data_empty() { |
| let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap(); |
| let setup_data = []; |
| let setup_data_addr = write_setup_data( |
| &mem, |
| GuestAddress(0x1000), |
| GuestAddress(0x2000), |
| &setup_data, |
| ) |
| .expect("write_setup_data"); |
| assert_eq!(setup_data_addr, None); |
| } |
| |
| #[test] |
| fn write_setup_data_two_of_them() { |
| let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap(); |
| |
| let entry1_addr = GuestAddress(0x1000); |
| let entry1_next_addr = entry1_addr; |
| let entry1_len_addr = entry1_addr.checked_add(12).unwrap(); |
| let entry1_data_addr = entry1_addr.checked_add(16).unwrap(); |
| let entry1_data = [0x55u8; 13]; |
| let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64; |
| let entry1_align = 3; |
| |
| let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align); |
| let entry2_next_addr = entry2_addr; |
| let entry2_len_addr = entry2_addr.checked_add(12).unwrap(); |
| let entry2_data_addr = entry2_addr.checked_add(16).unwrap(); |
| let entry2_data = [0xAAu8; 9]; |
| |
| let setup_data = [ |
| SetupData { |
| data: entry1_data.to_vec(), |
| type_: SetupDataType::Dtb, |
| }, |
| SetupData { |
| data: entry2_data.to_vec(), |
| type_: SetupDataType::Dtb, |
| }, |
| ]; |
| |
| let setup_data_head_addr = write_setup_data( |
| &mem, |
| GuestAddress(0x1000), |
| GuestAddress(0x2000), |
| &setup_data, |
| ) |
| .expect("write_setup_data"); |
| assert_eq!(setup_data_head_addr, Some(entry1_addr)); |
| |
| assert_eq!( |
| mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(), |
| entry2_addr.offset() |
| ); |
| assert_eq!( |
| mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(), |
| entry1_data.len() as u32 |
| ); |
| assert_eq!( |
| mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr) |
| .unwrap(), |
| entry1_data |
| ); |
| |
| assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0); |
| assert_eq!( |
| mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(), |
| entry2_data.len() as u32 |
| ); |
| assert_eq!( |
| mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(), |
| entry2_data |
| ); |
| } |
| |
| #[test] |
| fn cmdline_overflow() { |
| const MEM_SIZE: u64 = 0x1000; |
| let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap(); |
| let mut cmdline = kernel_cmdline::Cmdline::new(); |
| cmdline.insert_str("12345").unwrap(); |
| let cmdline_address = GuestAddress(MEM_SIZE - 5); |
| let err = |
| X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1) |
| .unwrap_err(); |
| assert!(matches!(err, Error::CommandLineOverflow)); |
| } |
| |
| #[test] |
| fn cmdline_write_end() { |
| const MEM_SIZE: u64 = 0x1000; |
| let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap(); |
| let mut cmdline = kernel_cmdline::Cmdline::new(); |
| cmdline.insert_str("1234").unwrap(); |
| let mut cmdline_address = GuestAddress(45); |
| X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1) |
| .unwrap(); |
| let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap(); |
| assert_eq!(val, b'1'); |
| cmdline_address = cmdline_address.unchecked_add(1); |
| let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap(); |
| assert_eq!(val, b'2'); |
| cmdline_address = cmdline_address.unchecked_add(1); |
| let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap(); |
| assert_eq!(val, b'3'); |
| cmdline_address = cmdline_address.unchecked_add(1); |
| let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap(); |
| assert_eq!(val, b'4'); |
| cmdline_address = cmdline_address.unchecked_add(1); |
| let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap(); |
| assert_eq!(val, b'\0'); |
| } |
| } |