| // Copyright 2022 The ChromiumOS Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! PageHandler manages the page states of multiple regions. |
| |
| #![deny(missing_docs)] |
| |
| use std::fs::File; |
| use std::mem; |
| use std::ops::Range; |
| use std::sync::Arc; |
| |
| use anyhow::Context; |
| use base::error; |
| use base::linux::FileDataIterator; |
| use base::AsRawDescriptor; |
| use base::SharedMemory; |
| use base::VolatileSlice; |
| use sync::Mutex; |
| use thiserror::Error as ThisError; |
| |
| use crate::file::Error as FileError; |
| use crate::file::SwapFile; |
| use crate::pagesize::addr_to_page_idx; |
| use crate::pagesize::bytes_to_pages; |
| use crate::pagesize::is_hugepage_aligned; |
| use crate::pagesize::is_page_aligned; |
| use crate::pagesize::page_base_addr; |
| use crate::pagesize::page_idx_to_addr; |
| use crate::pagesize::pages_to_bytes; |
| use crate::pagesize::round_up_hugepage_size; |
| use crate::pagesize::THP_SIZE; |
| use crate::staging::CopyOp; |
| use crate::staging::Error as StagingError; |
| use crate::staging::StagingMemory; |
| use crate::userfaultfd::Error as UffdError; |
| use crate::userfaultfd::Userfaultfd; |
| use crate::worker::Channel; |
| use crate::worker::Task; |
| use crate::SwapMetrics; |
| |
| pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB |
| const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB |
| |
| /// Result for PageHandler |
| pub type Result<T> = std::result::Result<T, Error>; |
| |
| /// Errors for PageHandler |
| #[derive(ThisError, Debug)] |
| pub enum Error { |
| #[error("the address is invalid {0:#018X}")] |
| /// the address is invalid |
| InvalidAddress(usize), |
| #[error("the regions {0:?} and {1:?} overlap")] |
| /// two regions overlap when registering |
| RegionOverlap(Range<usize>, Range<usize>), |
| #[error("failed to create page handler {0:?}")] |
| /// failed to create page handler |
| CreateFailed(anyhow::Error), |
| #[error("file operation failed : {0:?}")] |
| /// file operation failed |
| File(#[from] FileError), |
| #[error("staging operation failed : {0:?}")] |
| /// staging operation failed |
| Staging(#[from] StagingError), |
| #[error("userfaultfd failed : {0:?}")] |
| /// userfaultfd operation failed |
| Userfaultfd(#[from] UffdError), |
| #[error("failed to iterate data ranges: {0:?}")] |
| /// FileDataIterator failed |
| FileDataIterator(#[from] base::Error), |
| } |
| |
| /// Remove the memory range from the guest memory. |
| /// |
| /// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host |
| /// addresses instead of guest addresses. |
| /// |
| /// # Safety |
| /// |
| /// The memory range must be within the guest memory. |
| #[deny(unsafe_op_in_unsafe_fn)] |
| unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> { |
| // SAFETY: |
| // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust |
| // managed memory. |
| let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) }; |
| if ret < 0 { |
| base::errno_result() |
| } else { |
| Ok(()) |
| } |
| } |
| |
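| /// Copies `data_slice` into the faulting process at `page_addr` via the userfaultfd copy |
| /// operation, retrying with the remaining bytes whenever the kernel reports a partial copy. |
| /// |
| /// Any error other than [UffdError::PartiallyCopied] (including EEXIST) is returned to the |
| /// caller. |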
| fn uffd_copy_all( |
| uffd: &Userfaultfd, |
| mut page_addr: usize, |
| mut data_slice: VolatileSlice, |
| wake: bool, |
| ) -> std::result::Result<(), UffdError> { |
| loop { |
| let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake); |
| match result { |
| Err(UffdError::PartiallyCopied(copied)) => { |
| page_addr += copied; |
| data_slice.advance(copied); |
| } |
| other => { |
| // Even EEXIST for the copy operation should be treated as an error for page fault |
| // handling. If the page had been swapped in before, it should have been cleared from the |
| // swap file and `Userfaultfd::zero()` used instead. |
| return other.map(|_| ()); |
| } |
| } |
| } |
| } |
| |
| /// [Region] represents a memory region and corresponding [SwapFile]. |
| struct Region { |
| /// the head page index of the region. |
| head_page_idx: usize, |
| base_page_idx_in_file: usize, |
| num_pages: usize, |
| staging_memory: StagingMemory, |
| copied_from_file_pages: usize, |
| copied_from_staging_pages: usize, |
| zeroed_pages: usize, |
| swap_in_pages: usize, |
| /// the number of pages that were already initialized when the page fault arrived. |
| redundant_pages: usize, |
| } |
| |
| /// MoveToStaging copies chunks of consecutive pages from the guest memory to the staging memory |
| /// and removes those chunks from the guest memory. |
| pub struct MoveToStaging { |
| remove_area: Range<usize>, |
| copies: Vec<CopyOp>, |
| } |
| |
| impl Task for MoveToStaging { |
| fn execute(self) { |
| for copy_op in self.copies { |
| copy_op.execute(); |
| } |
| // Remove chunks of pages at once to reduce the number of madvise(2) syscalls. |
| // SAFETY: |
| // Safe because the region is already backed by the file and the content will be |
| // swapped in on a page fault. |
| let result = unsafe { |
| remove_memory( |
| self.remove_area.start, |
| self.remove_area.end - self.remove_area.start, |
| ) |
| }; |
| if let Err(e) = result { |
| panic!("failed to remove memory: {:?}", e); |
| } |
| } |
| } |
| |
| struct PageHandleContext<'a> { |
| file: SwapFile<'a>, |
| regions: Vec<Region>, |
| mlock_budget_pages: usize, |
| } |
| |
| /// PageHandler manages the page states of multiple regions. |
| /// |
| /// Handles multiple events derived from userfaultfd and swap out requests. |
| /// All the addresses and sizes in bytes are converted to page indices internally. |
| pub struct PageHandler<'a> { |
| ctx: Mutex<PageHandleContext<'a>>, |
| channel: Arc<Channel<MoveToStaging>>, |
| } |
| |
| impl<'a> PageHandler<'a> { |
| /// Creates [PageHandler] for the given regions. |
| /// |
| /// If any of the regions overlap, this returns [Error::RegionOverlap]. |
| /// |
| /// # Arguments |
| /// |
| /// * `swap_file` - The swap file. |
| /// * `staging_shmem` - The staging memory. It must be large enough to hold the guest memory; |
| /// otherwise the monitor process crashes when creating the mmap. |
| /// * `address_ranges` - The list of address ranges of the regions. Each start address must be |
| /// page-aligned and each size must be a multiple of the page size. |
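| /// |
| /// # Example |
| /// |
| /// A minimal sketch of wiring up a [PageHandler]. The setup helpers (`create_swap_file`, |
| /// `create_staging_shmem`, `guest_regions`, `staging_worker_channel`) are hypothetical |
| /// placeholders for the setup done by the swap monitor process. |
| /// |
| /// ```ignore |
| /// let swap_file: File = create_swap_file()?; |
| /// let staging_shmem: SharedMemory = create_staging_shmem()?; |
| /// let channel: Arc<Channel<MoveToStaging>> = staging_worker_channel(); |
| /// // One page-aligned range of host addresses per guest memory region. |
| /// let ranges: Vec<Range<usize>> = guest_regions(); |
| /// let page_handler = PageHandler::create(&swap_file, &staging_shmem, &ranges, channel)?; |
| /// ``` |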
| pub fn create( |
| swap_file: &'a File, |
| staging_shmem: &'a SharedMemory, |
| address_ranges: &[Range<usize>], |
| staging_move_context: Arc<Channel<MoveToStaging>>, |
| ) -> Result<Self> { |
| // Truncate the file to a size large enough to hold all regions; otherwise accesses beyond |
| // the end of the file may cause SIGBUS. |
| swap_file |
| .set_len( |
| address_ranges |
| .iter() |
| .map(|r| (r.end.saturating_sub(r.start)) as u64) |
| .sum(), |
| ) |
| .context("truncate swap file") |
| .map_err(Error::CreateFailed)?; |
| |
| let mut regions: Vec<Region> = Vec::new(); |
| let mut offset_pages = 0; |
| for address_range in address_ranges { |
| let head_page_idx = addr_to_page_idx(address_range.start); |
| if address_range.end < address_range.start { |
| return Err(Error::CreateFailed(anyhow::anyhow!( |
| "invalid region end < start" |
| ))); |
| } |
| let region_size = address_range.end - address_range.start; |
| let num_pages = bytes_to_pages(region_size); |
| |
| // Find an overlapping region |
| match regions.iter().position(|region| { |
| if region.head_page_idx < head_page_idx { |
| region.head_page_idx + region.num_pages > head_page_idx |
| } else { |
| region.head_page_idx < head_page_idx + num_pages |
| } |
| }) { |
| Some(i) => { |
| let region = ®ions[i]; |
| |
| return Err(Error::RegionOverlap( |
| address_range.clone(), |
| page_idx_to_addr(region.head_page_idx) |
| ..(page_idx_to_addr(region.head_page_idx + region.num_pages)), |
| )); |
| } |
| None => { |
| let base_addr = address_range.start; |
| assert!(is_page_aligned(base_addr)); |
| assert!(is_page_aligned(region_size)); |
| |
| let staging_memory = StagingMemory::new( |
| staging_shmem, |
| pages_to_bytes(offset_pages) as u64, |
| num_pages, |
| )?; |
| regions.push(Region { |
| head_page_idx, |
| base_page_idx_in_file: offset_pages, |
| num_pages, |
| staging_memory, |
| copied_from_file_pages: 0, |
| copied_from_staging_pages: 0, |
| zeroed_pages: 0, |
| swap_in_pages: 0, |
| redundant_pages: 0, |
| }); |
| offset_pages += num_pages; |
| } |
| } |
| } |
| |
| let file = SwapFile::new(swap_file, offset_pages)?; |
| |
| Ok(Self { |
| ctx: Mutex::new(PageHandleContext { |
| file, |
| regions, |
| mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET), |
| }), |
| channel: staging_move_context, |
| }) |
| } |
| |
| fn find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region> { |
| // Sequentially search for the corresponding region in the list. This should be fast enough |
| // because there are only a few regions (usually just 1). |
| regions.iter_mut().find(|region| { |
| region.head_page_idx <= page_idx && page_idx < region.head_page_idx + region.num_pages |
| }) |
| } |
| |
| /// Fills the faulted page with zeros if the page is not initialized, or with its content from |
| /// the staging memory or the swap file if the page has been moved out. |
| /// |
| /// # Arguments |
| /// |
| /// * `uffd` - the reference to the [Userfaultfd] for the faulting process. |
| /// * `address` - the address that triggered the page fault. |
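| /// |
| /// # Example |
| /// |
| /// A sketch of a fault-handling loop. `wait_for_page_fault` is a hypothetical stand-in for |
| /// however the monitor process receives page fault events from the userfaultfd. |
| /// |
| /// ```ignore |
| /// loop { |
| ///     let fault_addr: usize = wait_for_page_fault(&uffd)?; |
| ///     page_handler.handle_page_fault(&uffd, fault_addr)?; |
| /// } |
| /// ``` |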
| pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> { |
| let page_idx = addr_to_page_idx(address); |
| // the head address of the page. |
| let page_addr = page_base_addr(address); |
| let page_size = pages_to_bytes(1); |
| let mut ctx = self.ctx.lock(); |
| let PageHandleContext { regions, file, .. } = &mut *ctx; |
| let region = Self::find_region(regions, page_idx).ok_or(Error::InvalidAddress(address))?; |
| |
| let idx_in_region = page_idx - region.head_page_idx; |
| let idx_in_file = idx_in_region + region.base_page_idx_in_file; |
| if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? { |
| uffd_copy_all(uffd, page_addr, page_slice, true)?; |
| // TODO(b/265758094): optimize clear operation. |
| region |
| .staging_memory |
| .clear_range(idx_in_region..idx_in_region + 1)?; |
| region.copied_from_staging_pages += 1; |
| Ok(()) |
| } else if let Some(page_slice) = file.page_content(idx_in_file, false)? { |
| // TODO(kawasin): Unlock regions to proceed swap-in operation background. |
| uffd_copy_all(uffd, page_addr, page_slice, true)?; |
| // TODO(b/265758094): optimize clear operation. |
| // Do not erase the page from the disk for trimming optimization on next swap out. |
| let munlocked_pages = file.clear_range(idx_in_file..idx_in_file + 1)?; |
| region.copied_from_file_pages += 1; |
| ctx.mlock_budget_pages += munlocked_pages; |
| Ok(()) |
| } else { |
| // The page has no content in the staging memory or the swap file yet. Fill the |
| // faulting page with zeros. |
| // The faulting address is valid because it was reported by the userfaultfd. |
| let result = uffd.zero(page_addr, page_size, true); |
| match result { |
| Ok(_) => { |
| region.zeroed_pages += 1; |
| Ok(()) |
| } |
| Err(UffdError::PageExist) => { |
| // This case can happen when page faults for the same page are raised by different |
| // processes at the same time. |
| uffd.wake(page_addr, page_size)?; |
| region.redundant_pages += 1; |
| Ok(()) |
| } |
| Err(e) => Err(e.into()), |
| } |
| } |
| } |
| |
| /// Clear the internal state for the pages. |
| /// |
| /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd |
| /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event. |
| /// |
| /// In crosvm, the balloon device frees guest memory, which causes `UFFD_EVENT_REMOVE`. |
| /// |
| /// # Arguments |
| /// |
| /// * `start_addr` - the head address of the memory area to be freed. |
| /// * `end_addr` - the end address (exclusive) of the memory area to be freed. `UFFD_EVENT_REMOVE` |
| /// reports the head address of the memory area just after the freed area, so the exact tail |
| /// address of the freed area is `end_addr - 1`. |
| pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> { |
| if !is_page_aligned(start_addr) { |
| return Err(Error::InvalidAddress(start_addr)); |
| } else if !is_page_aligned(end_addr) { |
| return Err(Error::InvalidAddress(end_addr)); |
| } |
| let start_page_idx = addr_to_page_idx(start_addr); |
| let last_page_idx = addr_to_page_idx(end_addr); |
| let mut ctx = self.ctx.lock(); |
| // TODO(b/269983521): Clear multiple pages in the same region at once. |
| for page_idx in start_page_idx..(last_page_idx) { |
| let page_addr = page_idx_to_addr(page_idx); |
| // TODO(kawasin): Cache the position if the range does not span multiple regions. |
| let region = Self::find_region(&mut ctx.regions, page_idx) |
| .ok_or(Error::InvalidAddress(page_addr))?; |
| let idx_in_region = page_idx - region.head_page_idx; |
| let idx_range = idx_in_region..idx_in_region + 1; |
| if let Err(e) = region.staging_memory.clear_range(idx_range) { |
| error!("failed to clear removed page from staging: {:?}", e); |
| } |
| let idx_in_file = idx_in_region + region.base_page_idx_in_file; |
| let idx_range = idx_in_file..idx_in_file + 1; |
| // Erase the pages from the disk because the pages are removed from the guest memory. |
| let munlocked_pages = ctx.file.free_range(idx_range)?; |
| ctx.mlock_budget_pages += munlocked_pages; |
| } |
| Ok(()) |
| } |
| |
| /// Move active pages in the memory region to the staging memory. |
| /// |
| /// It moves only the active contents of the guest memory to the staging memory and skips empty |
| /// pages (e.g. pages never touched or freed by the balloon) using `lseek(2)` + `SEEK_HOLE/DATA`. |
| /// |
| /// Returns the count of moved out pages. |
| /// |
| /// # Arguments |
| /// |
| /// * `base_addr` - the head address of the memory region. |
| /// * `memfd` - the file descriptor of the memfd backing the guest memory region. |
| /// * `base_offset` - the offset of the memory region in the memfd. |
| /// |
| /// # Safety |
| /// |
| /// The region must have been registered to all userfaultfd of processes which may touch the |
| /// region. |
| /// |
| /// The memory must be protected not to be updated while moving. |
| /// |
| /// The page fault events for the region from the userfaultfd must be handled by |
| /// [Self::handle_page_fault()]. |
| /// |
| /// The caller must call [Channel::wait_complete()] to wait for all the copy operations to |
| /// complete within the memory protection period. |
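| /// |
| /// # Example |
| /// |
| /// A sketch of the swap-out entry point, assuming the guest memory is already registered with |
| /// the userfaultfd(s) and protected from updates. `protect_guest_memory`, |
| /// `unprotect_guest_memory` and the per-region metadata are hypothetical placeholders. |
| /// |
| /// ```ignore |
| /// protect_guest_memory()?; |
| /// let mut moved_pages = 0; |
| /// for region in guest_memory_regions() { |
| ///     // SAFETY: the region is registered with all userfaultfds, is protected from updates, |
| ///     // and its page faults are handled by `PageHandler::handle_page_fault()`. |
| ///     moved_pages += unsafe { |
| ///         page_handler.move_to_staging(region.host_addr, &region.memfd, region.offset)? |
| ///     }; |
| /// } |
| /// // Wait for the background copy + MADV_REMOVE tasks before lifting the protection. |
| /// channel.wait_complete(); |
| /// unprotect_guest_memory()?; |
| /// ``` |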
| #[deny(unsafe_op_in_unsafe_fn)] |
| pub unsafe fn move_to_staging<T>( |
| &self, |
| base_addr: usize, |
| memfd: &T, |
| base_offset: u64, |
| ) -> Result<usize> |
| where |
| T: AsRawDescriptor, |
| { |
| let hugepage_size = *THP_SIZE; |
| let mut ctx = self.ctx.lock(); |
| let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr)) |
| .ok_or(Error::InvalidAddress(base_addr))?; |
| |
| if page_idx_to_addr(region.head_page_idx) != base_addr { |
| return Err(Error::InvalidAddress(base_addr)); |
| } |
| let region_size = pages_to_bytes(region.num_pages); |
| let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64); |
| let mut moved_size = 0; |
| let mut copies = Vec::new(); |
| let mut remaining_batch_size = hugepage_size; |
| let mut batch_head_offset = 0; |
| let mut cur_data = None; |
| while let Some(data_range) = cur_data |
| .take() |
| .map(Ok) |
| .or_else(|| file_data.next()) |
| .transpose() |
| .map_err(Error::FileDataIterator)? |
| { |
| // Assert offset is page aligned |
| let offset = (data_range.start - base_offset) as usize; |
| assert!(is_page_aligned(offset)); |
| |
| // The chunk size must be within usize since the chunk is within the guest memory. |
| let chunk_size = (data_range.end - data_range.start) as usize; |
| let data_range = if chunk_size > remaining_batch_size { |
| // Split the chunk if it is bigger than remaining_batch_size. |
| |
| let split_size = if chunk_size >= hugepage_size { |
| // If the chunk size is bigger than or equal to the huge page size, the chunk may |
| // contain a huge page. If we MADV_REMOVE a huge page partially, it can cause |
| // inconsistency between the actual page table and the vmm-swap internal state. |
| let chunk_addr = base_addr + offset; |
| if !is_hugepage_aligned(chunk_addr) { |
| // Split the chunk just before where a huge page could start. |
| std::cmp::min( |
| round_up_hugepage_size(chunk_addr) - chunk_addr, |
| remaining_batch_size, |
| ) |
| } else { |
| if remaining_batch_size < hugepage_size { |
| // Remove the batch since it does not have enough room for a huge page. |
| self.channel.push(MoveToStaging { |
| remove_area: base_addr + batch_head_offset..base_addr + offset, |
| copies: mem::take(&mut copies), |
| }); |
| remaining_batch_size = hugepage_size; |
| batch_head_offset = offset; |
| } |
| hugepage_size |
| } |
| } else { |
| remaining_batch_size |
| }; |
| // Cache the rest of the split chunk to avoid an unnecessary lseek(2) syscall. |
| cur_data = Some(data_range.start + split_size as u64..data_range.end); |
| data_range.start..data_range.start + split_size as u64 |
| } else { |
| data_range |
| }; |
| |
| let size = (data_range.end - data_range.start) as usize; |
| assert!(is_page_aligned(size)); |
| |
| // SAFETY: |
| // Safe because: |
| // * src_addr is aligned with page size |
| // * the data_range starting from src_addr is on the guest memory. |
| let copy_op = unsafe { |
| region.staging_memory.copy( |
| (base_addr + offset) as *const u8, |
| bytes_to_pages(offset), |
| bytes_to_pages(size), |
| )? |
| }; |
| copies.push(copy_op); |
| |
| moved_size += size; |
| // The size must be less than or equal to remaining_batch_size. |
| remaining_batch_size -= size; |
| |
| if remaining_batch_size == 0 { |
| // Remove the batch of pages at once to reduce the number of madvise(2) syscalls. |
| self.channel.push(MoveToStaging { |
| remove_area: base_addr + batch_head_offset..base_addr + offset + size, |
| copies: mem::take(&mut copies), |
| }); |
| remaining_batch_size = hugepage_size; |
| batch_head_offset = offset + size; |
| } |
| } |
| // Remove the final batch of pages. |
| self.channel.push(MoveToStaging { |
| remove_area: base_addr + batch_head_offset..base_addr + region_size, |
| copies, |
| }); |
| |
| region.copied_from_file_pages = 0; |
| region.copied_from_staging_pages = 0; |
| region.zeroed_pages = 0; |
| region.swap_in_pages = 0; |
| region.redundant_pages = 0; |
| |
| Ok(bytes_to_pages(moved_size)) |
| } |
| |
| /// Write a chunk of consecutive pages in the staging memory to the swap file. |
| /// |
| /// If there are no active pages in the staging memory, this returns `Ok(0)`. |
| /// |
| /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()]. |
| /// |
| /// Returns the count of swapped out pages. |
| /// |
| /// Even if swap_out fails at any internal step, it does not break the page state management, |
| /// and `PageHandler` can continue working with a few pages leaked in the staging memory or the |
| /// swap file. The leaked pages are removed when vmm-swap is disabled and `PageHandler` is |
| /// dropped. |
| /// |
| /// # Arguments |
| /// |
| /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The |
| /// chunk is split if it is bigger than `max_size`. |
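| /// |
| /// # Example |
| /// |
| /// A sketch of draining the staging memory to the swap file in fixed-size chunks. The 2MiB |
| /// chunk size here is only an example value. |
| /// |
| /// ```ignore |
| /// const CHUNK_SIZE: usize = 2 * 1024 * 1024; |
| /// loop { |
| ///     if page_handler.swap_out(CHUNK_SIZE)? == 0 { |
| ///         // No more active pages in the staging memory. |
| ///         break; |
| ///     } |
| /// } |
| /// ``` |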
| pub fn swap_out(&self, max_size: usize) -> Result<usize> { |
| let max_pages = bytes_to_pages(max_size); |
| let mut ctx = self.ctx.lock(); |
| let PageHandleContext { regions, file, .. } = &mut *ctx; |
| for region in regions.iter_mut() { |
| if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) { |
| let idx_range_in_file = idx_range.start + region.base_page_idx_in_file |
| ..idx_range.end + region.base_page_idx_in_file; |
| let pages = idx_range.end - idx_range.start; |
| let slice = region.staging_memory.get_slice(idx_range.clone())?; |
| // Convert VolatileSlice to &[u8] |
| // SAFETY: |
| // Safe because the range of volatile slice is already validated. |
| let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) }; |
| file.write_to_file(idx_range_in_file.start, slice)?; |
| // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunk at |
| // once. |
| region.staging_memory.clear_range(idx_range)?; |
| // TODO(kawasin): free the page cache of the swap file. |
| // TODO(kawasin): use writev() to swap_out several small chunks at once. |
| return Ok(pages); |
| } |
| } |
| Ok(0) |
| } |
| |
| /// Create a new [SwapInContext]. |
| pub fn start_swap_in(&'a self) -> SwapInContext<'a> { |
| SwapInContext { |
| ctx: &self.ctx, |
| cur_staging: 0, |
| } |
| } |
| |
| /// Create a new [TrimContext]. |
| pub fn start_trim(&'a self) -> TrimContext<'a> { |
| TrimContext { |
| ctx: &self.ctx, |
| cur_page: 0, |
| cur_region: 0, |
| next_data_in_file: 0..0, |
| clean_pages: 0, |
| zero_pages: 0, |
| } |
| } |
| |
| /// Returns count of pages copied from vmm-swap file to the guest memory. |
| fn compute_copied_from_file_pages(&self) -> usize { |
| self.ctx |
| .lock() |
| .regions |
| .iter() |
| .map(|r| r.copied_from_file_pages) |
| .sum() |
| } |
| |
| /// Returns count of pages copied from staging memory to the guest memory. |
| fn compute_copied_from_staging_pages(&self) -> usize { |
| self.ctx |
| .lock() |
| .regions |
| .iter() |
| .map(|r| r.copied_from_staging_pages) |
| .sum() |
| } |
| |
| /// Returns count of pages initialized with zero. |
| fn compute_zeroed_pages(&self) -> usize { |
| self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum() |
| } |
| |
| /// Returns count of pages which were already initialized on page faults. |
| fn compute_redundant_pages(&self) -> usize { |
| self.ctx |
| .lock() |
| .regions |
| .iter() |
| .map(|r| r.redundant_pages) |
| .sum() |
| } |
| |
| /// Returns count of pages present in the staging memory. |
| fn compute_staging_pages(&self) -> usize { |
| self.ctx |
| .lock() |
| .regions |
| .iter() |
| .map(|r| r.staging_memory.present_pages()) |
| .sum() |
| } |
| |
| /// Returns count of pages present in the swap files. |
| fn compute_swap_pages(&self) -> usize { |
| self.ctx.lock().file.present_pages() |
| } |
| |
| /// Fill [SwapMetrics] with page handler metrics. |
| pub fn load_metrics(&self, metrics: &mut SwapMetrics) { |
| metrics.copied_from_file_pages = self.compute_copied_from_file_pages() as u64; |
| metrics.copied_from_staging_pages = self.compute_copied_from_staging_pages() as u64; |
| metrics.zeroed_pages = self.compute_zeroed_pages() as u64; |
| metrics.redundant_pages = self.compute_redundant_pages() as u64; |
| metrics.staging_pages = self.compute_staging_pages() as u64; |
| metrics.swap_pages = self.compute_swap_pages() as u64; |
| } |
| } |
| |
| /// Context for swap-in operation. |
| /// |
| /// This holds cursors of indices into the regions for each step as an optimization. |
| pub struct SwapInContext<'a> { |
| ctx: &'a Mutex<PageHandleContext<'a>>, |
| cur_staging: usize, |
| } |
| |
| impl SwapInContext<'_> { |
| /// Swap in a chunk of consecutive pages from the staging memory and the swap file. |
| /// |
| /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`. |
| /// |
| /// Returns the count of swapped in pages. |
| /// |
| /// # Arguments |
| /// |
| /// * `uffd` - the main [Userfaultfd]. |
| /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once. The |
| /// chunk is split if it is bigger than `max_size`. |
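| /// |
| /// # Example |
| /// |
| /// A sketch of swapping all pages back into the guest memory. The 4MiB chunk size is only an |
| /// example value. |
| /// |
| /// ```ignore |
| /// let mut swap_in_ctx = page_handler.start_swap_in(); |
| /// while swap_in_ctx.swap_in(&uffd, 4 * 1024 * 1024)? > 0 {} |
| /// // Dropping the context unlocks any pages mlock(2)ed for prefetching. |
| /// drop(swap_in_ctx); |
| /// ``` |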
| pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> { |
| let mut ctx = self.ctx.lock(); |
| // Request the kernel to pre-populate the present pages in the swap file into the page cache |
| // in the background. At most 16MB of pages will be populated. |
| // The threshold ensures MADV_WILLNEED is applied to a bigger chunk of pages, since the |
| // kernel populates consecutive pages at once on MADV_WILLNEED. |
| if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) { |
| let mlock_budget_pages = ctx.mlock_budget_pages; |
| let locked_pages = ctx.file.lock_and_async_prefetch(mlock_budget_pages)?; |
| ctx.mlock_budget_pages -= locked_pages; |
| } |
| |
| let max_pages = bytes_to_pages(max_size); |
| for region in ctx.regions[self.cur_staging..].iter_mut() { |
| // TODO(kawasin): swap_in multiple chunks less than max_size at once. |
| if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) { |
| let pages = idx_range.end - idx_range.start; |
| let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start); |
| let slice = region.staging_memory.get_slice(idx_range.clone())?; |
| uffd_copy_all(uffd, page_addr, slice, false)?; |
| // Clear the staging memory to avoid memory spike. |
| // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data |
| // at once. |
| region.staging_memory.clear_range(idx_range)?; |
| region.swap_in_pages += pages; |
| return Ok(pages); |
| } |
| self.cur_staging += 1; |
| } |
| |
| if let Some(mut idx_range_in_file) = ctx.file.first_data_range(max_pages) { |
| let PageHandleContext { regions, file, .. } = &mut *ctx; |
| for region in regions.iter_mut() { |
| let region_tail_idx_in_file = region.base_page_idx_in_file + region.num_pages; |
| if idx_range_in_file.start >= region_tail_idx_in_file { |
| continue; |
| } else if idx_range_in_file.start < region.base_page_idx_in_file { |
| return Err(Error::File(FileError::OutOfRange)); |
| } else if idx_range_in_file.end > region_tail_idx_in_file { |
| // The consecutive pages may span multiple regions. Swap in the pages of one region |
| // at a time. |
| idx_range_in_file.end = region_tail_idx_in_file; |
| } |
| let pages = idx_range_in_file.end - idx_range_in_file.start; |
| let page_addr = page_idx_to_addr( |
| idx_range_in_file.start - region.base_page_idx_in_file + region.head_page_idx, |
| ); |
| let slice = file.get_slice(idx_range_in_file.clone())?; |
| // TODO(kawasin): Unlock regions to proceed page fault handling on the main thread. |
| // We also need to handle the EEXIST error from UFFD_COPY. |
| uffd_copy_all(uffd, page_addr, slice, false)?; |
| // Do not erase each chunk of pages from disk on swap_in. The whole file will be |
| // truncated when swap_in is completed. Even if swap_in is aborted, the remaining |
| // disk contents help the trimming optimization on swap_out. |
| let munlocked_pages = file.clear_range(idx_range_in_file)?; |
| region.swap_in_pages += pages; |
| ctx.mlock_budget_pages += munlocked_pages; |
| return Ok(pages); |
| } |
| // The file has remaining pages, but all regions have been consumed. |
| return Err(Error::File(FileError::OutOfRange)); |
| } |
| |
| Ok(0) |
| } |
| } |
| |
| impl Drop for SwapInContext<'_> { |
| fn drop(&mut self) { |
| let mut ctx = self.ctx.lock(); |
| if let Err(e) = ctx.file.clear_mlock() { |
| panic!("failed to clear mlock: {:?}", e); |
| } |
| ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET); |
| } |
| } |
| |
| /// Context for trim operation. |
| /// |
| /// This drops 2 types of pages in the staging memory to reduce disk writes. |
| /// |
| /// * Clean pages |
| /// * The pages which have been swapped out to the disk and have not been changed. |
| /// * Drop the pages in the staging memory and mark them as present in the swap file. |
| /// * Zero pages |
| /// * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault. |
| pub struct TrimContext<'a> { |
| ctx: &'a Mutex<PageHandleContext<'a>>, |
| cur_region: usize, |
| cur_page: usize, |
| /// The page idx range of pages which have been stored in the swap file. |
| next_data_in_file: Range<usize>, |
| clean_pages: usize, |
| zero_pages: usize, |
| } |
| |
| impl TrimContext<'_> { |
| /// Trim pages in the staging memory. |
| /// |
| /// Returns the number of trimmed pages, or `None` once it has traversed all pages in the |
| /// staging memory. |
| /// |
| /// # Arguments |
| /// |
| /// `max_pages` - The maximum number of pages to be compared. |
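| /// |
| /// # Example |
| /// |
| /// A sketch of trimming the whole staging memory before swapping out. The per-step page count |
| /// is only an example value. |
| /// |
| /// ```ignore |
| /// let mut trim_ctx = page_handler.start_trim(); |
| /// while let Some(_trimmed) = trim_ctx.trim_pages(256)? { |
| ///     // Optionally yield to other work between steps. |
| /// } |
| /// let (clean, zero) = (trim_ctx.trimmed_clean_pages(), trim_ctx.trimmed_zero_pages()); |
| /// ``` |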
| pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> { |
| let mut ctx = self.ctx.lock(); |
| if self.cur_region >= ctx.regions.len() { |
| return Ok(None); |
| } |
| let PageHandleContext { regions, file, .. } = &mut *ctx; |
| let region = &mut regions[self.cur_region]; |
| let mut n_trimmed = 0; |
| |
| for _ in 0..max_pages { |
| if let Some(slice_in_staging) = region |
| .staging_memory |
| .page_content(self.cur_page) |
| .context("get page of staging memory")? |
| { |
| let idx_range = self.cur_page..self.cur_page + 1; |
| let idx_in_file = idx_range.start + region.base_page_idx_in_file; |
| |
| // Check for a zero page in the staging memory first. If the page is non-zero and has |
| // not been changed, the zero check is wasted, but it is still cheaper than the file |
| // I/O needed for pages which were in the swap file and are now zero. |
| // Check both page types in the same loop to utilize the CPU cache for the staging |
| // memory. |
| if slice_in_staging.is_all_zero() { |
| region |
| .staging_memory |
| .clear_range(idx_range.clone()) |
| .context("clear a page in staging memory")?; |
| // The page is on the swap file as well. |
| let munlocked_pages = file |
| .free_range(idx_in_file..idx_in_file + 1) |
| .context("clear a page in swap file")?; |
| if munlocked_pages != 0 { |
| // Only one of swap-in or trimming runs at a time, so this path is not expected. |
| // Just log an error because leaking mlock_budget_pages is not fatal. |
| error!("pages are mlock(2)ed while trimming"); |
| } |
| n_trimmed += 1; |
| self.zero_pages += 1; |
| } else if let Some(slice_in_file) = file.page_content(idx_in_file, true)? { |
| // Compare the page with the previous content of the page on the disk. |
| if slice_in_staging == slice_in_file { |
| region |
| .staging_memory |
| .clear_range(idx_range.clone()) |
| .context("clear a page in staging memory")?; |
| file.mark_as_present(idx_in_file)?; |
| n_trimmed += 1; |
| self.clean_pages += 1; |
| } |
| } |
| } |
| |
| self.cur_page += 1; |
| if self.cur_page >= region.num_pages { |
| self.cur_region += 1; |
| self.cur_page = 0; |
| self.next_data_in_file = 0..0; |
| break; |
| } |
| } |
| |
| Ok(Some(n_trimmed)) |
| } |
| |
| /// Total trimmed clean pages. |
| pub fn trimmed_clean_pages(&self) -> usize { |
| self.clean_pages |
| } |
| |
| /// Total trimmed zero pages. |
| pub fn trimmed_zero_pages(&self) -> usize { |
| self.zero_pages |
| } |
| } |