// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#![feature(portable_simd)]
// Modules are public for testing; don't expect a stable API.
mod cxx;
pub mod dither;
pub mod quant;
pub mod selectors;
use std::simd::prelude::*;
use std::simd::Simd;
use bytemuck::cast_slice;
use crate::dither::dither;
use crate::quant::{quantize_averages, QuantResult};
use crate::selectors::search_table_and_selectors;
// We primarily compute with 16-bit integers, and a width of 8 fills a 128-bit
// wide register (SSE, NEON). TODO(b/393494744): When we introduce
// multiversioning and support for AVX2 etc., this should be converted to a
// template parameter that varies based on the target architecture.
const SIMD_WIDTH: usize = 8;
const HALF_WIDTH: usize = SIMD_WIDTH / 2;
const QUARTER_WIDTH: usize = SIMD_WIDTH / 4;
type Reg = Simd<i16, SIMD_WIDTH>;
type Reg32 = Simd<i32, SIMD_WIDTH>;
type UReg = Simd<u16, SIMD_WIDTH>;
/// Define a helper that interleaves elements from two vectors, reinterprets
/// the result as a type twice as wide, and returns the resulting vector.
/// Each argument / return value is an array of vectors; conceptually, this
/// represents a vector that is <width> * <len> lanes wide; however, since
/// std::simd types have upper limits on their width, we represent them using
/// arrays to be portable.
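///
/// As a rough illustration, a hypothetical instantiation mirroring the ones in
/// `interleave_etc1` below (the literal widths stand in for `SIMD_WIDTH` and
/// `HALF_WIDTH`):
///
/// ```ignore
/// // Widen u16 lanes to u32 lanes: one 8-wide input pair -> two 4-wide outputs.
/// define_interleave!(conv_16_to_32, u16, u32, 8, 4, 1);
/// // conv_16_to_32([[a0, .., a7]], [[b0, .., b7]]) yields
/// // [[b0a0, b1a1, b2a2, b3a3], [b4a4, b5a5, b6a6, b7a7]], where "b0a0" is the
/// // u32 whose high half is b0 and low half is a0 (on a little-endian host).
/// ```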
macro_rules! define_interleave {
($fn_name:ident, $src_ty:ty, $dst_ty:ty, $src_width:expr, $dst_width:expr, $src_len:literal) => {
fn $fn_name(
a: [Simd<$src_ty, $src_width>; $src_len],
b: [Simd<$src_ty, $src_width>; $src_len],
) -> [Simd<$dst_ty, $dst_width>; $src_len * 2] {
let mut iter = (0..$src_len).flat_map(|i| {
let (a, b) = a[i].interleave(b[i]);
[a, b].map(|x| bytemuck::cast(x))
});
let res = std::array::from_fn(|_| iter.next().unwrap());
assert!(iter.next().is_none());
res
}
};
}
/// Convert individual codewords laid out as [15..0, 31..16, 47..32, 63..48]
/// into interleaved u64 arrays, while flipping the endianness (our internal
/// representation is little endian while ETC1 requires big endian).
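///
/// A minimal sketch of the transform (marked `ignore`: it assumes the
/// crate-internal `SIMD_WIDTH` of 8, and the constants are purely
/// illustrative):
///
/// ```ignore
/// // Every lane holds the codeword 0x1122_3344_5566_7788, split into 16-bit
/// // pieces from least to most significant.
/// let regs = [
///     Simd::splat(0x7788u16), // bits 15..0
///     Simd::splat(0x5566u16), // bits 31..16
///     Simd::splat(0x3344u16), // bits 47..32
///     Simd::splat(0x1122u16), // bits 63..48
/// ];
/// let out = interleave_etc1(regs);
/// // On a little-endian host each output lane reads back as
/// // 0x8877_6655_4433_2211, i.e. the bytes 0x11, 0x22, .., 0x88 land in memory
/// // in big-endian order, as ETC1 expects.
/// assert_eq!(out[0][0], 0x8877_6655_4433_2211);
/// ```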
#[inline]
pub fn interleave_etc1(regs: [UReg; 4]) -> [Simd<u64, QUARTER_WIDTH>; 4] {
// The interleaving assumes little endian.
#[cfg(target_endian = "big")]
compile_error!("Big endian is not supported");
define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
define_interleave!(conv_32_to_64, u32, u64, HALF_WIDTH, QUARTER_WIDTH, 2);
// Step 1: make each u16 codeword big-endian
let regs = regs.map(|r| r.swap_bytes());
// Step 2: [aaaa, bbbb] to [baba, baba]
let regs = [conv_16_to_32([regs[1]], [regs[0]]), conv_16_to_32([regs[3]], [regs[2]])];
// Step 3: [baba, baba], [dcdc, dcdc] to [dcba, dcba], [dcba, dcba]
let regs = conv_32_to_64(regs[1], regs[0]);
regs
}
/// Load `SIMD_WIDTH` blocks from a region `4*SIMD_WIDTH` wide and `4` tall,
/// starting at `base_x` and `base_y`.
///
/// Out-of-bounds pixels are padded by mirroring. For example, `abcdxy`
/// becomes `abcdxyyx`.
///
/// Returns a 3D array of SIMD vectors. Each block is mapped to a SIMD lane
/// (from left to right), and each pixel in the block is accessed as
/// `[y][x][channel]`.
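///
/// A small indexing sketch (hypothetical values; `src`, `width`, `height`, and
/// `row_width` are assumed to describe a valid image):
///
/// ```ignore
/// let data = load_input_block(src, width, height, row_width, 0, 0);
/// // Lane k of data[y][x][c] holds channel c (0 = R, 1 = G, 2 = B) of the
/// // pixel at (4 * k + x, y), mirrored back into bounds if necessary.
/// let red_in_block2 = data[3][1][0][2]; // block 2, pixel (x = 1, y = 3), red
/// ```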
#[inline]
pub fn load_input_block(
src: &[u32],
width: u32,
height: u32,
row_width: u32,
base_x: u32,
base_y: u32,
) -> [[[Reg; 3]; 4]; 4] {
let mut data = [[[Reg::default(); 3]; 4]; 4];
// For now, input load and output store are not vectorized. The main reason is
// that efficient loading requires shuffling, which is poorly supported
// by std::simd and the wide crate (which we plan to use for
// supporting the stable toolchain). Input load currently accounts for
// ~20% of the runtime. If shuffle support improves, this would be a
// good candidate for optimization.
for i in 0..4 {
for j in 0..4 {
let mut buf = [0u32; SIMD_WIDTH];
for block in 0..SIMD_WIDTH as u32 {
let x = base_x + block * 4 + j as u32;
let y = base_y + i as u32;
buf[block as usize] = if x < width && y < height {
// Fast path: load an in-bounds pixel
src[(y * row_width + x) as usize]
} else {
// Slow path: mirror out-of-bounds pixels.
// If width or height is 1, the mirrored index can overflow, so use a
// saturating subtraction.
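// For example, with width == 6 and x == 7, xm = (6 - 1) - (7 - 6) = 4,
// reproducing the "abcdxy" -> "abcdxyyx" pattern documented above.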
let xm = if x >= width { (width - 1).saturating_sub(x - width) } else { x };
let ym = if y >= height { (height - 1).saturating_sub(y - height) } else { y };
src[(ym * row_width + xm) as usize]
};
}
let rgbx = Simd::from_array(buf);
let extract_channel = |x: Simd<u32, SIMD_WIDTH>, shift: u32| {
(x >> shift).cast::<i16>() & Simd::splat(0xFF)
};
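// `src` pixels are RGBA in memory, i.e. 0xAABBGGRR per u32 on a
// little-endian host; the alpha byte is never extracted since ETC1 has no
// alpha channel.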
data[i][j][0] = extract_channel(rgbx, 0);
data[i][j][1] = extract_channel(rgbx, 8);
data[i][j][2] = extract_channel(rgbx, 16);
}
}
data
}
/// Compress RGB pixels to ETC1.
///
/// `src` should be in RGBA.
/// `dst` will be filled with compressed ETC1 blocks.
/// `width` and `height` do not need to be multiples of 4. The boundary pixels
/// will be padded with unspecified values.
/// `src_row_width` and `dst_row_width` specify the stride, in units of pixels
/// and blocks, respectively.
///
/// Note that `src` is assumed to be an aligned 32-bit buffer while `dst` has
/// no alignment requirement. This is due to two reasons: 32-bit alignment is
/// practical to obtain even on 32-bit platforms, whereas 64-bit alignment does
/// not hold on 32-bit ARM. Additionally, we require extensive shuffling when
/// loading inputs, but store to the output straight in pixel order. Dealing
/// with unaligned buffers in the latter case is significantly easier.
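///
/// A minimal usage sketch (marked `ignore` since the enclosing crate path is
/// not spelled out here; the buffer sizes follow from the stride parameters
/// described above):
///
/// ```ignore
/// let (width, height) = (17u32, 9u32);
/// let src_row_width = width;             // stride in pixels
/// let dst_row_width = width.div_ceil(4); // stride in blocks
/// let src = vec![0xFF00_00FFu32; (src_row_width * height) as usize]; // opaque red
/// let mut dst = vec![0u8; (dst_row_width * height.div_ceil(4) * 8) as usize];
/// compress_etc1(&src, &mut dst, width, height, src_row_width, dst_row_width);
/// // `dst` now holds 5 x 3 ETC1 blocks of 8 bytes each.
/// ```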
pub fn compress_etc1(
src: &[u32],
dst: &mut [u8],
width: u32,
height: u32,
src_row_width: u32,
dst_row_width: u32,
) {
let dst_height = height.div_ceil(4);
let dst_width = width.div_ceil(4);
// Aligned staging buffer. Data is copied into the potentially unaligned
// destination buffer at the end of each row.
let mut staging_row = vec![[Simd::splat(0); 4]; (dst_width as usize).div_ceil(SIMD_WIDTH)];
let copy_len = dst_width as usize * 8;
// Note on vectorization scheme:
//
// We process one 4x4 block per SIMD lane, instead of the more common practice
// of processing pixels within the same block in parallel using multiple
// lanes. The one-block-per-lane scheme, more akin to SPMD programming,
// allows most of our code to be shuffle-free, and works much better with
// portable SIMD than schemes that shuffle heavily.
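//
// For example, with SIMD_WIDTH == 8 each iteration of the inner loop below
// compresses eight horizontally adjacent 4x4 blocks at once, one per lane.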
for dst_y in 0..dst_height {
for dst_x0 in (0..dst_width).step_by(SIMD_WIDTH) {
let data = load_input_block(src, width, height, src_row_width, dst_x0 * 4, dst_y * 4);
let data = dither(&data);
let QuantResult { lo: hdr0, hi: hdr1, scaled0: ep0, scaled1: ep1 } =
quantize_averages(&data);
let best_fit = search_table_and_selectors(hdr0, hdr1, &data, [ep0, ep1]);
let codewords = interleave_etc1(best_fit);
staging_row[dst_x0 as usize / SIMD_WIDTH] = codewords;
}
let dst_row = &mut dst[(dst_y * dst_row_width * 8) as usize..];
let staging_row_bytes = cast_slice(&*staging_row);
dst_row[..copy_len].copy_from_slice(&staging_row_bytes[..copy_len]);
}
}