// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//! PageHandler manages the page states of multiple regions.
#![deny(missing_docs)]
use std::fs::File;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use anyhow::Context;
use base::error;
use base::linux::FileDataIterator;
use base::AsRawDescriptor;
use base::SharedMemory;
use base::VolatileSlice;
use sync::Mutex;
use thiserror::Error as ThisError;
use crate::file::Error as FileError;
use crate::file::SwapFile;
use crate::pagesize::addr_to_page_idx;
use crate::pagesize::bytes_to_pages;
use crate::pagesize::is_hugepage_aligned;
use crate::pagesize::is_page_aligned;
use crate::pagesize::page_base_addr;
use crate::pagesize::page_idx_to_addr;
use crate::pagesize::pages_to_bytes;
use crate::pagesize::round_up_hugepage_size;
use crate::pagesize::THP_SIZE;
use crate::staging::CopyOp;
use crate::staging::Error as StagingError;
use crate::staging::StagingMemory;
use crate::userfaultfd::Error as UffdError;
use crate::userfaultfd::Userfaultfd;
use crate::worker::Channel;
use crate::worker::Task;
use crate::SwapMetrics;
pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB
/// Result for PageHandler
pub type Result<T> = std::result::Result<T, Error>;
/// Errors for PageHandler
#[derive(ThisError, Debug)]
pub enum Error {
#[error("the address is invalid {0:#018X}")]
/// the address is invalid
InvalidAddress(usize),
#[error("the regions {0:?} and {1:?} overlap")]
    /// the regions overlap on registration
RegionOverlap(Range<usize>, Range<usize>),
#[error("failed to create page handler {0:?}")]
/// failed to create page handler
CreateFailed(anyhow::Error),
#[error("file operation failed : {0:?}")]
/// file operation failed
File(#[from] FileError),
#[error("staging operation failed : {0:?}")]
/// staging operation failed
Staging(#[from] StagingError),
#[error("userfaultfd failed : {0:?}")]
/// userfaultfd operation failed
Userfaultfd(#[from] UffdError),
#[error("failed to iterate data ranges: {0:?}")]
/// FileDataIterator failed
FileDataIterator(#[from] base::Error),
}
/// Remove the memory range on the guest memory.
///
/// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
/// addresses instead of guest addresses.
///
/// # Safety
///
/// The memory range must be on the guest memory.
#[deny(unsafe_op_in_unsafe_fn)]
unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
// SAFETY:
// Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
// managed memory.
let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
if ret < 0 {
base::errno_result()
} else {
Ok(())
}
}
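/// Copies the whole `data_slice` to `page_addr` in the faulting process via `UFFD_COPY`,
/// retrying from the next uncopied page whenever the kernel reports a partial copy.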
fn uffd_copy_all(
uffd: &Userfaultfd,
mut page_addr: usize,
mut data_slice: VolatileSlice,
wake: bool,
) -> std::result::Result<(), UffdError> {
loop {
let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
match result {
Err(UffdError::PartiallyCopied(copied)) => {
page_addr += copied;
data_slice.advance(copied);
}
other => {
                // Even EEXIST on a copy operation is an error for page fault handling. If the
                // page had been swapped in before, it should have been cleared from the swap
                // file and `Userfaultfd::zero()` used instead.
return other.map(|_| ());
}
}
}
}
/// [Region] represents a memory region and corresponding [SwapFile].
struct Region {
/// the head page index of the region.
head_page_idx: usize,
    /// the index in the swap file of the first page of the region.
    base_page_idx_in_file: usize,
    /// the number of pages in the region.
    num_pages: usize,
    /// the staging memory for the region.
    staging_memory: StagingMemory,
    /// the number of pages copied from the swap file on page faults.
    copied_from_file_pages: usize,
    /// the number of pages copied from the staging memory on page faults.
    copied_from_staging_pages: usize,
    /// the number of pages initialized with zeros on page faults.
    zeroed_pages: usize,
    /// the number of pages swapped in from the staging memory or the swap file.
    swap_in_pages: usize,
    /// the number of pages which were already initialized on page faults.
redundant_pages: usize,
}
/// MoveToStaging copies chunks of consecutive pages next to each other on the guest memory to the
/// staging memory and removes the chunks on the guest memory.
pub struct MoveToStaging {
remove_area: Range<usize>,
copies: Vec<CopyOp>,
}
impl Task for MoveToStaging {
fn execute(self) {
for copy_op in self.copies {
copy_op.execute();
}
        // Remove chunks of pages at once to reduce madvise(2) syscalls.
// SAFETY:
// Safe because the region is already backed by the file and the content will be
// swapped in on a page fault.
let result = unsafe {
remove_memory(
self.remove_area.start,
self.remove_area.end - self.remove_area.start,
)
};
if let Err(e) = result {
panic!("failed to remove memory: {:?}", e);
}
}
}
struct PageHandleContext<'a> {
file: SwapFile<'a>,
regions: Vec<Region>,
mlock_budget_pages: usize,
}
/// PageHandler manages the page states of multiple regions.
///
/// Handles multiple events derived from userfaultfd and swap out requests.
/// All the addresses and sizes in bytes are converted to page id internally.
pub struct PageHandler<'a> {
ctx: Mutex<PageHandleContext<'a>>,
channel: Arc<Channel<MoveToStaging>>,
}
impl<'a> PageHandler<'a> {
    /// Creates [PageHandler] for the given regions.
    ///
    /// If any of the regions overlap, this returns [Error::RegionOverlap].
///
/// # Arguments
///
/// * `swap_file` - The swap file.
    /// * `staging_shmem` - The staging memory. It must be large enough to hold the guest
    ///   memory. Otherwise the monitor process crashes when creating the mmap.
    /// * `address_ranges` - The list of address ranges of the regions. The start address must
    ///   be page-aligned and the size must be a multiple of the page size.
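    ///
    /// # Example
    ///
    /// A minimal sketch (not a compiled doctest); the swap file, the staging shmem, the worker
    /// channel, and the page-aligned `base_addr`/`region_size` are assumed to be prepared by
    /// the caller.
    ///
    /// ```ignore
    /// let handler = PageHandler::create(
    ///     &swap_file,
    ///     &staging_shmem,
    ///     &[base_addr..base_addr + region_size],
    ///     channel.clone(),
    /// )?;
    /// ```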
pub fn create(
swap_file: &'a File,
staging_shmem: &'a SharedMemory,
address_ranges: &[Range<usize>],
        staging_move_context: Arc<Channel<MoveToStaging>>,
) -> Result<Self> {
        // Truncate the file to the size required to hold all regions; otherwise accesses
        // beyond the end of the file may cause SIGBUS.
swap_file
.set_len(
address_ranges
.iter()
.map(|r| (r.end.saturating_sub(r.start)) as u64)
.sum(),
)
.context("truncate swap file")
.map_err(Error::CreateFailed)?;
let mut regions: Vec<Region> = Vec::new();
let mut offset_pages = 0;
for address_range in address_ranges {
let head_page_idx = addr_to_page_idx(address_range.start);
if address_range.end < address_range.start {
return Err(Error::CreateFailed(anyhow::anyhow!(
"invalid region end < start"
)));
}
let region_size = address_range.end - address_range.start;
let num_pages = bytes_to_pages(region_size);
// Find an overlapping region
match regions.iter().position(|region| {
if region.head_page_idx < head_page_idx {
region.head_page_idx + region.num_pages > head_page_idx
} else {
region.head_page_idx < head_page_idx + num_pages
}
}) {
Some(i) => {
let region = &regions[i];
return Err(Error::RegionOverlap(
address_range.clone(),
page_idx_to_addr(region.head_page_idx)
..(page_idx_to_addr(region.head_page_idx + region.num_pages)),
));
}
None => {
let base_addr = address_range.start;
assert!(is_page_aligned(base_addr));
assert!(is_page_aligned(region_size));
let staging_memory = StagingMemory::new(
staging_shmem,
pages_to_bytes(offset_pages) as u64,
num_pages,
)?;
regions.push(Region {
head_page_idx,
base_page_idx_in_file: offset_pages,
num_pages,
staging_memory,
copied_from_file_pages: 0,
copied_from_staging_pages: 0,
zeroed_pages: 0,
swap_in_pages: 0,
redundant_pages: 0,
});
offset_pages += num_pages;
}
}
}
let file = SwapFile::new(swap_file, offset_pages)?;
Ok(Self {
ctx: Mutex::new(PageHandleContext {
file,
regions,
mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
}),
            channel: staging_move_context,
})
}
fn find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region> {
        // Sequentially search for the corresponding region in the list. This should be fast
        // enough because there are only a few regions (usually just one).
regions.iter_mut().find(|region| {
region.head_page_idx <= page_idx && page_idx < region.head_page_idx + region.num_pages
})
}
    /// Fills the faulted page with zeros if it has not been initialized, or with its content
    /// from the staging memory or the swap file if it has been swapped out.
///
/// # Arguments
///
/// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
/// * `address` - the address that triggered the page fault.
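    ///
    /// # Example
    ///
    /// A sketch of handling a single fault; reading `UFFD_EVENT_PAGEFAULT` events from the
    /// userfaultfd and extracting `fault_addr` is assumed to happen elsewhere.
    ///
    /// ```ignore
    /// if let Err(e) = handler.handle_page_fault(&uffd, fault_addr) {
    ///     error!("failed to handle page fault: {:?}", e);
    /// }
    /// ```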
pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
let page_idx = addr_to_page_idx(address);
// the head address of the page.
let page_addr = page_base_addr(address);
let page_size = pages_to_bytes(1);
let mut ctx = self.ctx.lock();
let PageHandleContext { regions, file, .. } = &mut *ctx;
let region = Self::find_region(regions, page_idx).ok_or(Error::InvalidAddress(address))?;
let idx_in_region = page_idx - region.head_page_idx;
let idx_in_file = idx_in_region + region.base_page_idx_in_file;
if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
uffd_copy_all(uffd, page_addr, page_slice, true)?;
// TODO(b/265758094): optimize clear operation.
region
.staging_memory
.clear_range(idx_in_region..idx_in_region + 1)?;
region.copied_from_staging_pages += 1;
Ok(())
} else if let Some(page_slice) = file.page_content(idx_in_file, false)? {
// TODO(kawasin): Unlock regions to proceed swap-in operation background.
uffd_copy_all(uffd, page_addr, page_slice, true)?;
// TODO(b/265758094): optimize clear operation.
            // Do not erase the page from the disk, to enable the trimming optimization on the
            // next swap out.
let munlocked_pages = file.clear_range(idx_in_file..idx_in_file + 1)?;
region.copied_from_file_pages += 1;
ctx.mlock_budget_pages += munlocked_pages;
Ok(())
} else {
            // Map a zero page since nothing has been written to the swap file for this page
            // yet but the fault happened.
            // Safe because the faulting page is notified by uffd.
let result = uffd.zero(page_addr, page_size, true);
match result {
Ok(_) => {
region.zeroed_pages += 1;
Ok(())
}
Err(UffdError::PageExist) => {
// This case can happen if page faults on the same page happen on different
// processes.
uffd.wake(page_addr, page_size)?;
region.redundant_pages += 1;
Ok(())
}
Err(e) => Err(e.into()),
}
}
}
/// Clear the internal state for the pages.
///
/// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
/// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
///
    /// In crosvm, the balloon device frees guest memory, which causes `UFFD_EVENT_REMOVE`.
///
/// # Arguments
///
/// * `start_addr` - the head address of the memory area to be freed.
    /// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE`
    ///   reports the head address of the memory area next to the freed area (i.e. the exact
    ///   tail address of the freed area is `end_addr - 1`).
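    ///
    /// # Example
    ///
    /// A sketch for a `UFFD_EVENT_REMOVE` event, where `start` and `end` are the addresses
    /// reported in the event.
    ///
    /// ```ignore
    /// handler.handle_page_remove(start, end)?;
    /// ```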
pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
if !is_page_aligned(start_addr) {
return Err(Error::InvalidAddress(start_addr));
} else if !is_page_aligned(end_addr) {
return Err(Error::InvalidAddress(end_addr));
}
let start_page_idx = addr_to_page_idx(start_addr);
let last_page_idx = addr_to_page_idx(end_addr);
let mut ctx = self.ctx.lock();
// TODO(b/269983521): Clear multiple pages in the same region at once.
for page_idx in start_page_idx..(last_page_idx) {
let page_addr = page_idx_to_addr(page_idx);
// TODO(kawasin): Cache the position if the range does not span multiple regions.
let region = Self::find_region(&mut ctx.regions, page_idx)
.ok_or(Error::InvalidAddress(page_addr))?;
let idx_in_region = page_idx - region.head_page_idx;
let idx_range = idx_in_region..idx_in_region + 1;
if let Err(e) = region.staging_memory.clear_range(idx_range) {
error!("failed to clear removed page from staging: {:?}", e);
}
let idx_in_file = idx_in_region + region.base_page_idx_in_file;
let idx_range = idx_in_file..idx_in_file + 1;
// Erase the pages from the disk because the pages are removed from the guest memory.
let munlocked_pages = ctx.file.free_range(idx_range)?;
ctx.mlock_budget_pages += munlocked_pages;
}
Ok(())
}
/// Move active pages in the memory region to the staging memory.
///
    /// It only moves active contents in the guest memory to the staging memory and skips empty
    /// pages (e.g. pages not touched, freed by balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
///
/// Returns the count of moved out pages.
///
/// # Arguments
///
/// * `base_addr` - the head address of the memory region.
/// * `memfd` - the file descriptor of the memfd backing the guest memory region.
/// * `base_offset` - the offset of the memory region in the memfd.
///
/// # Safety
///
    /// The region must have been registered with the userfaultfd of every process which may
    /// touch the region.
///
    /// The memory must be protected from updates while the pages are being moved.
///
/// The page fault events for the region from the userfaultfd must be handled by
/// [Self::handle_page_fault()].
///
    /// The caller must call [Channel::wait_complete()] to wait for all the copy operations to
    /// complete within the memory protection period.
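    ///
    /// # Example
    ///
    /// A sketch of the expected calling sequence under the safety contract above;
    /// `protect_region()` and `unprotect_region()` are hypothetical placeholders for whatever
    /// mechanism the caller uses to keep the region from being updated while moving.
    ///
    /// ```ignore
    /// protect_region(base_addr, region_size);
    /// // SAFETY: the region is registered with all userfaultfds and is protected from
    /// // updates until the copy operations complete.
    /// let moved_pages = unsafe { handler.move_to_staging(base_addr, &memfd, base_offset)? };
    /// channel.wait_complete();
    /// unprotect_region(base_addr, region_size);
    /// ```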
#[deny(unsafe_op_in_unsafe_fn)]
pub unsafe fn move_to_staging<T>(
&self,
base_addr: usize,
memfd: &T,
base_offset: u64,
) -> Result<usize>
where
T: AsRawDescriptor,
{
let hugepage_size = *THP_SIZE;
let mut ctx = self.ctx.lock();
let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
.ok_or(Error::InvalidAddress(base_addr))?;
if page_idx_to_addr(region.head_page_idx) != base_addr {
return Err(Error::InvalidAddress(base_addr));
}
let region_size = pages_to_bytes(region.num_pages);
let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
let mut moved_size = 0;
let mut copies = Vec::new();
let mut remaining_batch_size = hugepage_size;
let mut batch_head_offset = 0;
let mut cur_data = None;
while let Some(data_range) = cur_data
.take()
.map(Ok)
.or_else(|| file_data.next())
.transpose()
.map_err(Error::FileDataIterator)?
{
// Assert offset is page aligned
let offset = (data_range.start - base_offset) as usize;
assert!(is_page_aligned(offset));
// The chunk size must be within usize since the chunk is within the guest memory.
let chunk_size = (data_range.end - data_range.start) as usize;
let data_range = if chunk_size > remaining_batch_size {
// Split the chunk if it is bigger than remaining_batch_size.
let split_size = if chunk_size >= hugepage_size {
                    // If the chunk size is bigger than or equal to the huge page size, the
                    // chunk may contain a huge page. If we MADV_REMOVE a huge page partially,
                    // it can cause inconsistency between the actual page table and the
                    // vmm-swap internal state.
let chunk_addr = base_addr + offset;
if !is_hugepage_aligned(chunk_addr) {
                        // Split the chunk just before where the next huge page could start.
std::cmp::min(
round_up_hugepage_size(chunk_addr) - chunk_addr,
remaining_batch_size,
)
} else {
if remaining_batch_size < hugepage_size {
// Remove the batch since it does not have enough room for a huge page.
self.channel.push(MoveToStaging {
remove_area: base_addr + batch_head_offset..base_addr + offset,
copies: mem::take(&mut copies),
});
remaining_batch_size = hugepage_size;
batch_head_offset = offset;
}
hugepage_size
}
} else {
remaining_batch_size
};
            // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
cur_data = Some(data_range.start + split_size as u64..data_range.end);
data_range.start..data_range.start + split_size as u64
} else {
data_range
};
let size = (data_range.end - data_range.start) as usize;
assert!(is_page_aligned(size));
// SAFETY:
// Safe because:
// * src_addr is aligned with page size
// * the data_range starting from src_addr is on the guest memory.
let copy_op = unsafe {
region.staging_memory.copy(
(base_addr + offset) as *const u8,
bytes_to_pages(offset),
bytes_to_pages(size),
)?
};
copies.push(copy_op);
moved_size += size;
            // The size must be less than or equal to remaining_batch_size.
remaining_batch_size -= size;
if remaining_batch_size == 0 {
                // Remove the batch of pages at once to reduce madvise(2) syscalls.
self.channel.push(MoveToStaging {
remove_area: base_addr + batch_head_offset..base_addr + offset + size,
copies: mem::take(&mut copies),
});
remaining_batch_size = hugepage_size;
batch_head_offset = offset + size;
}
}
// Remove the final batch of pages.
self.channel.push(MoveToStaging {
remove_area: base_addr + batch_head_offset..base_addr + region_size,
copies,
});
region.copied_from_file_pages = 0;
region.copied_from_staging_pages = 0;
region.zeroed_pages = 0;
region.swap_in_pages = 0;
region.redundant_pages = 0;
Ok(bytes_to_pages(moved_size))
}
/// Write a chunk of consecutive pages in the staging memory to the swap file.
///
    /// If there are no active pages in the staging memory, this returns `Ok(0)`.
///
/// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
///
/// Returns the count of swapped out pages.
///
    /// Even if swap_out fails at any internal step, it does not break the page state
    /// management, and `PageHandler` can continue working with a few pages leaked in the
    /// staging memory or the swap file. The leaked pages are removed when vmm-swap is disabled
    /// and `PageHandler` is dropped.
///
/// # Arguments
///
    /// * `max_size` - the upper limit of the chunk size to write into the swap file at once.
    ///   The chunk is split if it is bigger than `max_size`.
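    ///
    /// # Example
    ///
    /// A sketch of a swap-out loop that drains the staging memory in chunks of at most
    /// `max_size` bytes; `handler` and `max_size` are assumed to be provided by the caller.
    ///
    /// ```ignore
    /// loop {
    ///     let pages = handler.swap_out(max_size)?;
    ///     if pages == 0 {
    ///         // No active pages are left in the staging memory.
    ///         break;
    ///     }
    /// }
    /// ```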
pub fn swap_out(&self, max_size: usize) -> Result<usize> {
let max_pages = bytes_to_pages(max_size);
let mut ctx = self.ctx.lock();
let PageHandleContext { regions, file, .. } = &mut *ctx;
for region in regions.iter_mut() {
if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
let idx_range_in_file = idx_range.start + region.base_page_idx_in_file
..idx_range.end + region.base_page_idx_in_file;
let pages = idx_range.end - idx_range.start;
let slice = region.staging_memory.get_slice(idx_range.clone())?;
// Convert VolatileSlice to &[u8]
// SAFETY:
// Safe because the range of volatile slice is already validated.
let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
file.write_to_file(idx_range_in_file.start, slice)?;
// TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunk at
// once.
region.staging_memory.clear_range(idx_range)?;
// TODO(kawasin): free the page cache of the swap file.
// TODO(kawasin): use writev() to swap_out several small chunks at once.
return Ok(pages);
}
}
Ok(0)
}
/// Create a new [SwapInContext].
pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
SwapInContext {
ctx: &self.ctx,
cur_staging: 0,
}
}
/// Create a new [TrimContext].
pub fn start_trim(&'a self) -> TrimContext<'a> {
TrimContext {
ctx: &self.ctx,
cur_page: 0,
cur_region: 0,
next_data_in_file: 0..0,
clean_pages: 0,
zero_pages: 0,
}
}
/// Returns count of pages copied from vmm-swap file to the guest memory.
fn compute_copied_from_file_pages(&self) -> usize {
self.ctx
.lock()
.regions
.iter()
.map(|r| r.copied_from_file_pages)
.sum()
}
/// Returns count of pages copied from staging memory to the guest memory.
fn compute_copied_from_staging_pages(&self) -> usize {
self.ctx
.lock()
.regions
.iter()
.map(|r| r.copied_from_staging_pages)
.sum()
}
/// Returns count of pages initialized with zero.
fn compute_zeroed_pages(&self) -> usize {
self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
}
/// Returns count of pages which were already initialized on page faults.
fn compute_redundant_pages(&self) -> usize {
self.ctx
.lock()
.regions
.iter()
.map(|r| r.redundant_pages)
.sum()
}
/// Returns count of pages present in the staging memory.
fn compute_staging_pages(&self) -> usize {
self.ctx
.lock()
.regions
.iter()
.map(|r| r.staging_memory.present_pages())
.sum()
}
/// Returns count of pages present in the swap files.
fn compute_swap_pages(&self) -> usize {
self.ctx.lock().file.present_pages()
}
/// Fill [SwapMetrics] with page handler metrics.
pub fn load_metrics(&self, metrics: &mut SwapMetrics) {
metrics.copied_from_file_pages = self.compute_copied_from_file_pages() as u64;
metrics.copied_from_staging_pages = self.compute_copied_from_staging_pages() as u64;
metrics.zeroed_pages = self.compute_zeroed_pages() as u64;
metrics.redundant_pages = self.compute_redundant_pages() as u64;
metrics.staging_pages = self.compute_staging_pages() as u64;
metrics.swap_pages = self.compute_swap_pages() as u64;
}
}
/// Context for swap-in operation.
///
/// This holds cursors of indices into the regions for each step as an optimization.
pub struct SwapInContext<'a> {
ctx: &'a Mutex<PageHandleContext<'a>>,
cur_staging: usize,
}
impl SwapInContext<'_> {
/// Swap in a chunk of consecutive pages from the staging memory and the swap file.
///
    /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
///
/// Returns the count of swapped in pages.
///
/// # Arguments
///
/// * `uffd` - the main [Userfaultfd].
    /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once.
    ///   The chunk is split if it is bigger than `max_size`.
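    ///
    /// # Example
    ///
    /// A sketch of a swap-in loop driven by [PageHandler::start_swap_in()]; `handler`, `uffd`,
    /// and `max_size` are assumed to be provided by the caller.
    ///
    /// ```ignore
    /// let mut swap_in_ctx = handler.start_swap_in();
    /// while swap_in_ctx.swap_in(&uffd, max_size)? > 0 {}
    /// ```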
pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
let mut ctx = self.ctx.lock();
        // Request the kernel to pre-populate the present pages in the swap file to the page
        // cache in the background. At most 16MB of pages will be populated.
        // The threshold exists to apply MADV_WILLNEED to bigger chunks of pages, since the
        // kernel populates consecutive pages at once on MADV_WILLNEED.
if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
let mlock_budget_pages = ctx.mlock_budget_pages;
let locked_pages = ctx.file.lock_and_async_prefetch(mlock_budget_pages)?;
ctx.mlock_budget_pages -= locked_pages;
}
let max_pages = bytes_to_pages(max_size);
for region in ctx.regions[self.cur_staging..].iter_mut() {
// TODO(kawasin): swap_in multiple chunks less than max_size at once.
if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
let pages = idx_range.end - idx_range.start;
let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
let slice = region.staging_memory.get_slice(idx_range.clone())?;
uffd_copy_all(uffd, page_addr, slice, false)?;
// Clear the staging memory to avoid memory spike.
// TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data
// at once.
region.staging_memory.clear_range(idx_range)?;
region.swap_in_pages += pages;
return Ok(pages);
}
self.cur_staging += 1;
}
if let Some(mut idx_range_in_file) = ctx.file.first_data_range(max_pages) {
let PageHandleContext { regions, file, .. } = &mut *ctx;
for region in regions.iter_mut() {
let region_tail_idx_in_file = region.base_page_idx_in_file + region.num_pages;
if idx_range_in_file.start >= region_tail_idx_in_file {
continue;
} else if idx_range_in_file.start < region.base_page_idx_in_file {
return Err(Error::File(FileError::OutOfRange));
} else if idx_range_in_file.end > region_tail_idx_in_file {
                    // The consecutive pages can span multiple regions. Swap in the pages of
                    // one region at a time.
idx_range_in_file.end = region_tail_idx_in_file;
}
let pages = idx_range_in_file.end - idx_range_in_file.start;
let page_addr = page_idx_to_addr(
idx_range_in_file.start - region.base_page_idx_in_file + region.head_page_idx,
);
let slice = file.get_slice(idx_range_in_file.clone())?;
// TODO(kawasin): Unlock regions to proceed page fault handling on the main thread.
// We also need to handle the EEXIST error from UFFD_COPY.
uffd_copy_all(uffd, page_addr, slice, false)?;
// Do not erase each chunk of pages from disk on swap_in. The whole file will be
// truncated when swap_in is completed. Even if swap_in is aborted, the remaining
// disk contents help the trimming optimization on swap_out.
let munlocked_pages = file.clear_range(idx_range_in_file)?;
region.swap_in_pages += pages;
ctx.mlock_budget_pages += munlocked_pages;
return Ok(pages);
}
            // The file has remaining pages, but the regions have all been consumed.
return Err(Error::File(FileError::OutOfRange));
}
Ok(0)
}
}
impl Drop for SwapInContext<'_> {
fn drop(&mut self) {
let mut ctx = self.ctx.lock();
if let Err(e) = ctx.file.clear_mlock() {
panic!("failed to clear mlock: {:?}", e);
}
ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
}
}
/// Context for trim operation.
///
/// This drops 2 types of pages in the staging memory to reduce disk writes:
///
/// * Clean pages
///   * The pages which have been swapped out to the disk and have not been changed.
///   * Drop the pages in the staging memory and mark them as present on the swap file.
/// * Zero pages
///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
pub struct TrimContext<'a> {
ctx: &'a Mutex<PageHandleContext<'a>>,
cur_region: usize,
cur_page: usize,
/// The page idx range of pages which have been stored in the swap file.
next_data_in_file: Range<usize>,
clean_pages: usize,
zero_pages: usize,
}
impl TrimContext<'_> {
/// Trim pages in the staging memory.
///
    /// Returns the count of trimmed pages, or `None` if it has traversed all pages in the
    /// staging memory.
///
/// # Arguments
///
    /// * `max_pages` - the maximum number of pages to compare.
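    ///
    /// # Example
    ///
    /// A sketch of a trim loop driven by [PageHandler::start_trim()]; `handler` and
    /// `max_pages` are assumed to be provided by the caller.
    ///
    /// ```ignore
    /// let mut trim_ctx = handler.start_trim();
    /// while let Some(trimmed) = trim_ctx.trim_pages(max_pages)? {
    ///     info!("trimmed {} pages", trimmed);
    /// }
    /// ```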
pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
let mut ctx = self.ctx.lock();
if self.cur_region >= ctx.regions.len() {
return Ok(None);
}
let PageHandleContext { regions, file, .. } = &mut *ctx;
let region = &mut regions[self.cur_region];
let mut n_trimmed = 0;
for _ in 0..max_pages {
if let Some(slice_in_staging) = region
.staging_memory
.page_content(self.cur_page)
.context("get page of staging memory")?
{
let idx_range = self.cur_page..self.cur_page + 1;
let idx_in_file = idx_range.start + region.base_page_idx_in_file;
                // Check for a zero page in the staging memory first. If the page is non-zero
                // and has not been changed, the zero check is wasted, but it costs less than
                // the file I/O for pages which were in the swap file and are now zero.
                // Check both types of page in the same loop to utilize the CPU cache for the
                // staging memory.
if slice_in_staging.is_all_zero() {
region
.staging_memory
.clear_range(idx_range.clone())
.context("clear a page in staging memory")?;
// The page is on the swap file as well.
let munlocked_pages = file
.free_range(idx_in_file..idx_in_file + 1)
.context("clear a page in swap file")?;
if munlocked_pages != 0 {
                        // Swap-in and trimming never run at the same time, so this is not an
                        // expected path. Just log an error because leaking mlock_budget_pages
                        // is not fatal.
error!("pages are mlock(2)ed while trimming");
}
n_trimmed += 1;
self.zero_pages += 1;
} else if let Some(slice_in_file) = file.page_content(idx_in_file, true)? {
// Compare the page with the previous content of the page on the disk.
if slice_in_staging == slice_in_file {
region
.staging_memory
.clear_range(idx_range.clone())
.context("clear a page in staging memory")?;
file.mark_as_present(idx_in_file)?;
n_trimmed += 1;
self.clean_pages += 1;
}
}
}
self.cur_page += 1;
if self.cur_page >= region.num_pages {
self.cur_region += 1;
self.cur_page = 0;
self.next_data_in_file = 0..0;
break;
}
}
Ok(Some(n_trimmed))
}
/// Total trimmed clean pages.
pub fn trimmed_clean_pages(&self) -> usize {
self.clean_pages
}
/// Total trimmed zero pages.
pub fn trimmed_zero_pages(&self) -> usize {
self.zero_pages
}
}