// The intrinsics only exist on aarch64, so the import carries the same `cfg` as the function
// that uses it; otherwise the file does not build on any other target.
#[cfg(target_arch = "aarch64")]
// `vst1q_u16` is no longer needed by `pack_rgb565_neon`; it is kept in the list in case other
// code in this file still relies on it.
#[allow(unused_imports)]
use std::arch::aarch64::{
    uint16x8_t, uint8x8x3_t, vld3_u8, vmovl_u8, vorrq_u16, vreinterpretq_u8_u16, vshlq_n_u16,
    vshrq_n_u16, vst1q_u16, vst1q_u8,
};

/// Packs interleaved 8-bit `R, G, B` triples into RGB565 using NEON.
///
/// Every complete 3-byte triple in `rgb_data` becomes one 16-bit value laid out as
/// `RRRRRGGG GGGBBBBB` (the top 5 bits of red, top 6 bits of green and top 5 bits of blue).
/// Each value is written to the output as two bytes in the target's native byte order
/// (little-endian on the usual aarch64 targets).
///
/// Trailing bytes that do not form a complete triple are ignored, so the result is always
/// `(rgb_data.len() / 3) * 2` bytes long. An empty input yields an empty vector.
#[cfg(target_arch = "aarch64")]
pub fn pack_rgb565_neon(rgb_data: &[u8]) -> Vec<u8> {
    /// Pixels converted per NEON iteration.
    const LANES: usize = 8;
    /// Input bytes consumed per NEON iteration (three channels per pixel).
    const SRC_BLOCK: usize = LANES * 3;
    /// Output bytes produced per NEON iteration (one `u16` per pixel).
    const DST_BLOCK: usize = LANES * 2;

    let pixels = rgb_data.len() / 3;

    // Allocate the output as bytes from the start. Building a `Vec<u16>` and re-labelling its
    // allocation as `Vec<u8>` via `from_raw_parts` is unsound: the allocation would later be
    // freed with a different alignment than it was created with.
    let mut packed: Vec<u8> = Vec::with_capacity(pixels * 2);

    let mut blocks = rgb_data.chunks_exact(SRC_BLOCK);
    let mut simd_bytes = 0;

    // Source blocks come first in the `zip`, so iteration ends when the input runs out and no
    // source block is ever consumed without a destination chunk to pair it with.
    for (src, dst) in blocks
        .by_ref()
        .zip(packed.spare_capacity_mut().chunks_exact_mut(DST_BLOCK))
    {
        // SAFETY: `src` is exactly `SRC_BLOCK` (24) readable bytes, which is what `vld3_u8`
        // loads. `dst` is exactly `DST_BLOCK` (16) writable bytes inside the vector's
        // allocation, which is what `vst1q_u8` stores; byte stores need no extra alignment.
        unsafe {
            // De-interleave: rgb.0 = 8 reds, rgb.1 = 8 greens, rgb.2 = 8 blues.
            let rgb: uint8x8x3_t = vld3_u8(src.as_ptr());

            // Widen to 16 bits, drop the low bits of each channel, then move the channel
            // into its position in the 5-6-5 layout.
            let red = vshlq_n_u16::<11>(vshrq_n_u16::<3>(vmovl_u8(rgb.0)));
            let green = vshlq_n_u16::<5>(vshrq_n_u16::<2>(vmovl_u8(rgb.1)));
            let blue = vshrq_n_u16::<3>(vmovl_u8(rgb.2));

            let rgb565: uint16x8_t = vorrq_u16(vorrq_u16(red, green), blue);
            vst1q_u8(dst.as_mut_ptr().cast::<u8>(), vreinterpretq_u8_u16(rgb565));
        }
        simd_bytes += DST_BLOCK;
    }

    // SAFETY: the loop above initialised exactly `simd_bytes` bytes at the start of the
    // vector's spare capacity, and `simd_bytes <= pixels * 2 <= capacity`.
    unsafe { packed.set_len(simd_bytes) };

    // Scalar tail: at most `LANES - 1` pixels remain. Capacity is already reserved, so these
    // appends never reallocate.
    for px in blocks.remainder().chunks_exact(3) {
        let red = u16::from(px[0] >> 3);
        let green = u16::from(px[1] >> 2);
        let blue = u16::from(px[2] >> 3);
        let rgb565 = (red << 11) | (green << 5) | blue;
        packed.extend_from_slice(&rgb565.to_ne_bytes());
    }

    packed
}